diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml
index aca3aada82e959dc6ad9ce7493d52a9a43352c8e..7072f40240ea96c36d41ac029d544e4a1579d2a2 100644
--- a/.github/workflows/simple.yml
+++ b/.github/workflows/simple.yml
@@ -27,7 +27,7 @@ jobs:
     if: github.ref == 'refs/heads/master' || (github.event.pull_request.draft == false && contains(github.event.pull_request.requested_reviewers.*.login, 'oneflow-ci-bot') && contains(github.event.pull_request.labels.*.name, 'need-simple-ci'))
     runs-on: ${{ matrix.os }}
     needs: [cancel_previous]
-    timeout-minutes: 120
+    timeout-minutes: 240
     strategy:
       fail-fast: true
       max-parallel: 3
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1cf13d3e7afb752e1c46d75339b4b110699ede0f..01de83e923fe5b525b3f2f2c1837c542a80ef44f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -53,11 +53,13 @@ jobs:
         id: license_check
         run: |
           python3 ci/check/run_license_format.py -i oneflow -c
+          python3 ci/check/run_license_format.py -i python -c
       - name: Add license
         id: license_fmt
         if: ${{ failure() }}
         run: |
           python3 ci/check/run_license_format.py -i oneflow --fix
+          python3 ci/check/run_license_format.py -i python --fix
       - name: Check C++/CUDA format
         id: cpp_check
         run: |
@@ -521,7 +523,7 @@ jobs:
         run: |
           docker run \
             ${{ env.extra_docker_args }} ${{ env.pip_cache_docker_args }} \
-            -e ONEFLOW_TEST_DIR=$PWD/oneflow/python/test/modules \
+            -e ONEFLOW_TEST_DIR=$PWD/python/oneflow/test/modules \
             ${{ env.image_tag }} \
             bash -c "python3 -m pip config set global.index-url ${{ env.pip_index_mirror }} && bash ci/test/try_install.sh && bash ci/test_multi_client/generic_test.sh"
       - name: Dataloader API test
@@ -530,7 +532,7 @@ jobs:
         run: |
           docker run \
             ${{ env.extra_docker_args }} ${{ env.pip_cache_docker_args }} \
-            -e ONEFLOW_TEST_DIR=$PWD/oneflow/python/test/dataloader \
+            -e ONEFLOW_TEST_DIR=$PWD/python/oneflow/test/dataloader \
             ${{ env.image_tag }} \
             bash -c "python3 -m pip config set global.index-url ${{ env.pip_index_mirror }} && bash ci/test/try_install.sh && bash ci/test/generic_test.sh"
       - name: Tensor API test
@@ -539,7 +541,7 @@ jobs:
         run: |
           docker run \
             ${{ env.extra_docker_args }} ${{ env.pip_cache_docker_args }} \
-            -e ONEFLOW_TEST_DIR=$PWD/oneflow/python/test/tensor \
+            -e ONEFLOW_TEST_DIR=$PWD/python/oneflow/test/tensor \
             ${{ env.image_tag }} \
             bash -c "python3 -m pip config set global.index-url ${{ env.pip_index_mirror }} && bash ci/test/try_install.sh && bash ci/test/generic_test.sh"
       - name: Graph API test
@@ -547,7 +549,7 @@ jobs:
         run: |
           docker run \
             ${{ env.extra_docker_args }} ${{ env.pip_cache_docker_args }} \
-            -e ONEFLOW_TEST_DIR=$PWD/oneflow/python/test/graph \
+            -e ONEFLOW_TEST_DIR=$PWD/python/oneflow/test/graph \
             ${{ env.image_tag }} \
             bash -c "python3 -m pip config set global.index-url ${{ env.pip_index_mirror }} && bash ci/test/try_install.sh && bash ci/test/generic_test.sh"
       - name: Checkout Oneflow-Inc/models
@@ -555,7 +557,7 @@ jobs:
         uses: actions/checkout@v2
         with:
           repository: Oneflow-Inc/models
-          ref: 830a6b91f10c0a04a68843370cea6319a21ed9c2
+          ref: c822b17f9ed598185cf1fa70f570871d12db7442
           path: oneflow-models
       - name: Speed test
         id: speed
diff --git a/.gitignore b/.gitignore
index 46c05ff0c1794515429900fa26abb02fc6a282f2..78efe4306d0435f9ef8e0192ab4804899164c0a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,4 +31,4 @@ compile_commands.json
 /distributed-tmp
 /serving-tmp
 test_tmp_dir
-oneflow/python/version.py
+unittest-log-*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 68b7fa17bfb1912c0319bd2b099cedd76a9f1c27..f29207db6aaa966e175a17fc21ad08c1b7876786 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,8 +105,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(THIRD_PARTY_DIR "${PROJECT_BINARY_DIR}/third_party_install"
   CACHE PATH "Where to install third party headers and libs")
 
-set(THIRD_PARTY_SUBMODULE_DIR "${PROJECT_SOURCE_DIR}/build/third_party"
-  CACHE PATH "Where the third party submodules are")
+set(ONEFLOW_PYTHON_DIR "${PROJECT_SOURCE_DIR}/python" CACHE PATH "oneflow python src dir")
 
 if(WIN32)
   set(CMAKE_BUILD_TYPE Debug)
diff --git a/README.md b/README.md
index 7d3e2303b91a536c4c07a3708db2b0efeae91d52..5ea0a201dcd4e2d56cf76dfff2c87dcef83fc135 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 **OneFlow is a performance-centered and open-source deep learning framework.**
 
 [![Simple CI](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml/badge.svg)](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml)
-
+[![Documentation Status](https://readthedocs.org/projects/oneflow/badge/?version=master)](https://oneflow.readthedocs.io/en/master/?badge=master)
 ## Latest News
 - Version 0.4.0 is out!
   - New Pytorch flavored APIs (`import oneflow.experimental as flow`)
diff --git a/ci/check/run_license_format.py b/ci/check/run_license_format.py
index f528bbc29d946eba18431dafd1a039fd086db95f..622c064735176e765a33d26d3ec34ef3eabfd693 100644
--- a/ci/check/run_license_format.py
+++ b/ci/check/run_license_format.py
@@ -85,6 +85,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "-v", "--verbose", default=False, action="store_true", required=False
     )
+    parser.add_argument("--silent", default=False, action="store_true", required=False)
     parser.add_argument(
         "-c", "--check", default=False, action="store_true", required=False
     )
@@ -109,4 +110,5 @@ if __name__ == "__main__":
                     if args.verbose:
                         print("license already added:", p)
                 else:
-                    print("license just added:", p)
+                    if args.silent == False:
+                        print("license just added:", p)
diff --git a/ci/test/1node_benchmark_test.sh b/ci/test/1node_benchmark_test.sh
index e1fbe28a1faf32379209d05aed8564437eed0bb2..44cfe0ff32ae90380f805efec84a61c099ca2a4b 100644
--- a/ci/test/1node_benchmark_test.sh
+++ b/ci/test/1node_benchmark_test.sh
@@ -1,7 +1,7 @@
 set -xe
 
 rm -rf /benchmarks
-cp -r oneflow/compatible_single_client_python/benchmarks /benchmarks
+cp -r python/oneflow/compatible/single_client/benchmarks /benchmarks
 cd /benchmarks
 
 python3 cnn_benchmark/of_cnn_benchmarks.py \
@@ -42,7 +42,7 @@ python3 cnn_benchmark/of_cnn_benchmarks.py \
     --iter_num=5 \
     --learning_rate=0.01 \
     --optimizer="sgd" \
-    --loss_print_every_n_iter=1 
+    --loss_print_every_n_iter=1
 
 python3 bert_benchmark/run_pretraining.py \
     --gpu_num_per_node=1 \
diff --git a/ci/test/1node_benchmark_test_fp16.sh b/ci/test/1node_benchmark_test_fp16.sh
index 2d6abd6a1067c55b0bedbd79044ea274875611eb..7d61c4552d8f43c5fd9faa3ff176105f2ee39b65 100644
--- a/ci/test/1node_benchmark_test_fp16.sh
+++ b/ci/test/1node_benchmark_test_fp16.sh
@@ -1,7 +1,7 @@
 set -ex
 
 rm -rf /benchmarks
-cp -r oneflow/compatible_single_client_python/benchmarks /benchmarks
+cp -r python/oneflow/compatible/single_client/benchmarks /benchmarks
 cd /benchmarks
 
 python3 cnn_benchmark/of_cnn_benchmarks.py \
diff --git a/ci/test/1node_custom_op_test.sh b/ci/test/1node_custom_op_test.sh
index db8c2de3838696674cdf102d3fd41e6cda654088..141b8152bdc728fcf99b05c897a5943ab464bf89 100644
--- a/ci/test/1node_custom_op_test.sh
+++ b/ci/test/1node_custom_op_test.sh
@@ -3,11 +3,11 @@
 set -xe
 
 src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
-test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"/test_tmp_dir"}
+test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
 
 rm -rf $test_tmp_dir
 mkdir -p $test_tmp_dir
-cp -r $src_dir/oneflow/compatible_single_client_python/test/custom_ops $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test/custom_ops $test_tmp_dir
 cd $test_tmp_dir
 
 export ONEFLOW_TEST_DEVICE_NUM=1
diff --git a/ci/test/1node_model_eager_test.sh b/ci/test/1node_model_eager_test.sh
index 5bb5c6ca504a4d09625ff445594e3577cfdd5354..d7f4291723723fc5764a630dd67995f307ac8984 100644
--- a/ci/test/1node_model_eager_test.sh
+++ b/ci/test/1node_model_eager_test.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -xe
 
-cp -r oneflow/python/test /test_dir
+cp -r python/oneflow/test /test_dir
 cd /test_dir
 
 python3 models/eager_1node_test.py
diff --git a/ci/test/1node_model_serve_test.sh b/ci/test/1node_model_serve_test.sh
index 1942f0eba9a62376146ab09b39b9d9fdc7d3d1e1..84d250de475033e47a281320e0cfc4fcc1ee20bc 100644
--- a/ci/test/1node_model_serve_test.sh
+++ b/ci/test/1node_model_serve_test.sh
@@ -6,7 +6,7 @@ test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"/test_tmp_dir"}
 
 rm -rf $test_tmp_dir
 mkdir -p $test_tmp_dir
-cp -r $src_dir/oneflow/compatible_single_client_python/test $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test $test_tmp_dir
 cd $test_tmp_dir
 
 export ONEFLOW_TEST_DEVICE_NUM=1
diff --git a/ci/test/1node_model_test.sh b/ci/test/1node_model_test.sh
index 589557e69358369c4d5e0140cc40f8734b23a676..db34ef873ff8f92c71eb4289a9a7aade7d5fed7c 100644
--- a/ci/test/1node_model_test.sh
+++ b/ci/test/1node_model_test.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -xe
 
-cp -r oneflow/compatible_single_client_python/test /test_dir
+cp -r python/oneflow/compatible/single_client/test /test_dir
 cd /test_dir
 
 python3 models/1node_test.py
diff --git a/ci/test/1node_op_test.sh b/ci/test/1node_op_test.sh
index 28943e01ad39a0a300afb97b7e238c607a1432e2..0c3057d8365bc162da5cbaf93e45c22839db8821 100644
--- a/ci/test/1node_op_test.sh
+++ b/ci/test/1node_op_test.sh
@@ -10,7 +10,7 @@ test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
 
 rm -rf $test_tmp_dir
 mkdir -p $test_tmp_dir
-cp -r $src_dir/oneflow/compatible_single_client_python/test $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test $test_tmp_dir
 cd $test_tmp_dir
 
 python3 -m oneflow --doctor
diff --git a/ci/test/2node_op_test.sh b/ci/test/2node_op_test.sh
index 892bd94f1c7b798765eeec3f6ef8d0bb30ad83da..6d0312f033f41cfa2251323a4902f5e72e787449 100644
--- a/ci/test/2node_op_test.sh
+++ b/ci/test/2node_op_test.sh
@@ -10,7 +10,7 @@ test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"/test_tmp_dir"}
 rm -rf $test_tmp_dir
 mkdir -p $test_tmp_dir
 chmod -R o+w $test_tmp_dir
-cp -r $src_dir/oneflow/compatible_single_client_python/test $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test $test_tmp_dir
 cd $test_tmp_dir
 
 ONEFLOW_TEST_DEVICE_NUM=1 python3 test/ops/test_assign.py --failfast --verbose
diff --git a/ci/test/doctest.sh b/ci/test/doctest.sh
index 4989c62ce06066d4f038e78e56f3edb1ebdd7a1b..d7a83c840756cf78cad6eab32d5153c500f6b1fd 100644
--- a/ci/test/doctest.sh
+++ b/ci/test/doctest.sh
@@ -11,7 +11,7 @@ python3 -c 'import oneflow; f=open("oneflow_path.txt", "w"); f.write(oneflow.__p
 gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
 python3 $src_dir/ci/test/parallel_run.py \
     --gpu_num=${gpu_num} \
-    --dir=$(cat oneflow_path.txt)/python \
+    --dir=$(cat oneflow_path.txt) \
     --timeout=1 \
     --verbose \
     --chunk=1 \
diff --git a/ci/test/dry_run_test.sh b/ci/test/dry_run_test.sh
index f1f3dbff16f152c57aa5848fc5b4be16f641826c..332cf2901eb7c81e8e341f8e51f0cc5d2e7799e6 100644
--- a/ci/test/dry_run_test.sh
+++ b/ci/test/dry_run_test.sh
@@ -9,7 +9,7 @@ test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
 
 rm -rf $test_tmp_dir
 mkdir -p $test_tmp_dir
-cp -r $src_dir/oneflow/compatible_single_client_python/benchmarks $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/benchmarks $test_tmp_dir
 cd $test_tmp_dir/benchmarks
 
 export ONEFLOW_DRY_RUN=1
diff --git a/ci/test/generic_test.sh b/ci/test/generic_test.sh
index 9eb09fe06d43da237970ce7b64f22363d90731e2..16ac4c7395767d16c52ddb508d8ed9a38bf15a52 100644
--- a/ci/test/generic_test.sh
+++ b/ci/test/generic_test.sh
@@ -5,9 +5,9 @@ export TF_CPP_MIN_LOG_LEVEL=3
 export PYTHONUNBUFFERED=1
 
 src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
-test_dir=${ONEFLOW_TEST_DIR:-"$PWD/oneflow/python/test/ops"}
+test_dir=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/ops"}
 test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
-export ONEFLOW_TEST_UTILS_DIR=$src_dir/oneflow/python/test_utils
+export ONEFLOW_TEST_UTILS_DIR=$src_dir/python/oneflow/test_utils
 
 rm -rf $test_tmp_dir
 mkdir -p $test_tmp_dir
diff --git a/ci/test/parallel_run.py b/ci/test/parallel_run.py
index cdf803e5cf1637f8a5548647e019616cf2379cf3..345d26134aaeeb4618b28325ac0b28e72945876d 100644
--- a/ci/test/parallel_run.py
+++ b/ci/test/parallel_run.py
@@ -13,13 +13,14 @@ import uuid
 def gen_cmds(cmd=None, dir=None, doctest=False):
     if doctest:
         paths = glob.glob(os.path.join(dir, "**/*.py"), recursive=True)
-        print(paths)
+        paths = [p for p in paths if "compatible" not in p and "single_client" not in p]
         with_doctest = []
         for p in paths:
             with open(p) as f:
                 content = f.read()
-                if "doctest" in content and "__" not in p:
+                if "doctest" in content:
                     with_doctest.append("{} {} -v".format(cmd, p))
+        print(with_doctest)
         return with_doctest
     else:
         paths = glob.glob(os.path.join(dir, "test_*.py"), recursive=False)
diff --git a/ci/test/test_xla.sh b/ci/test/test_xla.sh
index de0f6697eccc08a70f48ee08b4b8ad647a7de29f..29f72f177fc8a4ca09d56494f65eef56dccd8c3d 100644
--- a/ci/test/test_xla.sh
+++ b/ci/test/test_xla.sh
@@ -5,7 +5,7 @@ test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"/test_tmp_dir"}
 
 rm -rf $test_tmp_dir
 mkdir -p $test_tmp_dir
-cp -r $src_dir/oneflow/compatible_single_client_python/test/xrt $test_tmp_dir
+cp -r $src_dir/python/oneflow/compatible/single_client/test/xrt $test_tmp_dir
 cd $test_tmp_dir
 python3 -c "import oneflow.compatible.single_client as flow; assert flow.sysconfig.with_xla()"
-for f in $src_dir/oneflow/compatible_single_client_python/test/xrt/*.py; do python3 "$f"; done
+for f in $src_dir/python/oneflow/compatible/single_client/test/xrt/*.py; do python3 "$f"; done
diff --git a/ci/test_multi_client/generic_test.sh b/ci/test_multi_client/generic_test.sh
index 941d727e6c6a61fdc12513788989218ad48a6911..ef56e85c3fe90a839b596dcf013f8d2f4e075e4d 100644
--- a/ci/test_multi_client/generic_test.sh
+++ b/ci/test_multi_client/generic_test.sh
@@ -4,9 +4,9 @@ set -xe
 export PYTHONUNBUFFERED=1
 
 src_dir=${ONEFLOW_SRC_DIR:-"$PWD"}
-test_dir=${ONEFLOW_TEST_DIR:-"$PWD/oneflow/python/test/modules"}
+test_dir=${ONEFLOW_TEST_DIR:-"$PWD/python/oneflow/test/modules"}
 test_tmp_dir=${ONEFLOW_TEST_TMP_DIR:-"./test_tmp_dir"}
-export ONEFLOW_TEST_UTILS_DIR=$src_dir/oneflow/python/test_utils
+export ONEFLOW_TEST_UTILS_DIR=$src_dir/python/oneflow/test_utils
 
 
 rm -rf $test_tmp_dir
diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake
index 7497afa5095941dba1ded7beb6960ad619d32a68..3e51545cda2156725f610d5eae93f7414b763db8 100644
--- a/cmake/oneflow.cmake
+++ b/cmake/oneflow.cmake
@@ -166,6 +166,7 @@ endforeach()
 # clang format
 add_custom_target(of_format
   COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i ${CMAKE_CURRENT_SOURCE_DIR}/oneflow --fix
+  COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i ${ONEFLOW_PYTHON_DIR} --fix
   COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_clang_format.py --source_dir ${CMAKE_CURRENT_SOURCE_DIR}/oneflow --fix --quiet
   COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_py_format.py --source_dir ${CMAKE_CURRENT_SOURCE_DIR} --fix
   )
@@ -276,7 +277,7 @@ pybind11_add_module(oneflow_internal ${PYBIND11_SRCS} ${of_pybind_obj_cc} ${PYBI
 set_property(TARGET oneflow_internal PROPERTY CXX_VISIBILITY_PRESET "default")
 add_dependencies(oneflow_internal of_cfgobj generate_py_cfg)
 set_target_properties(oneflow_internal PROPERTIES PREFIX "_")
-set_target_properties(oneflow_internal PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/python_scripts/oneflow")
+set_target_properties(oneflow_internal PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${ONEFLOW_PYTHON_DIR}/oneflow")
 target_link_libraries(oneflow_internal PRIVATE ${of_libs} ${oneflow_third_party_libs} of_pyext_obj ${oneflow_exe_third_party_libs})
 target_include_directories(oneflow_internal PRIVATE ${Python_INCLUDE_DIRS} ${Python_NumPy_INCLUDE_DIRS})
 
@@ -288,55 +289,16 @@ if (WITH_XLA)
   list(APPEND gen_pip_args --xla)
 endif()
 
-set(of_pyscript_dir "${PROJECT_BINARY_DIR}/python_scripts")
 add_custom_target(of_pyscript_copy ALL
-    COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/tools/clean_generated_api.py --root_path=${of_pyscript_dir}
-    COMMAND "${CMAKE_COMMAND}" -E copy
-        "${PROJECT_SOURCE_DIR}/oneflow/init.py" "${of_pyscript_dir}/oneflow/__init__.py"
-    COMMAND "${CMAKE_COMMAND}" -E copy
-        "${PROJECT_SOURCE_DIR}/oneflow/__main__.py" "${of_pyscript_dir}/oneflow/__main__.py"
-    COMMAND rm -rf ${of_pyscript_dir}/oneflow/python
-    COMMAND rm -rf ${of_pyscript_dir}/oneflow/compatible
-    COMMAND ${CMAKE_COMMAND} -E create_symlink "${PROJECT_SOURCE_DIR}/oneflow/python" "${of_pyscript_dir}/oneflow/python"
-    COMMAND "${CMAKE_COMMAND}" -E copy
-        "${PROJECT_SOURCE_DIR}/oneflow/single_client_init.py" "${of_pyscript_dir}/oneflow/compatible/single_client/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E create_symlink "${PROJECT_SOURCE_DIR}/oneflow/compatible_single_client_python" "${of_pyscript_dir}/oneflow/compatible/single_client/python"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/compatible/__init__.py"
-    COMMAND "${CMAKE_COMMAND}" -E copy
-        "${PROJECT_SOURCE_DIR}/oneflow/single_client_main.py" "${of_pyscript_dir}/oneflow/compatible/single_client/__main__.py"
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${of_pyscript_dir}/oneflow/distributed"
-    COMMAND ${CMAKE_COMMAND} -E create_symlink "${PROJECT_SOURCE_DIR}/oneflow/python/distributed/launch.py" "${of_pyscript_dir}/oneflow/distributed/launch.py"
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${of_proto_python_dir}/oneflow/core" "${of_pyscript_dir}/oneflow/core"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/core/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${of_proto_python_dir}/oneflow/core" "${of_pyscript_dir}/oneflow/compatible/single_client/core"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/compatible/single_client/core/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${of_pyscript_dir}/oneflow/F"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/F/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${of_pyscript_dir}/oneflow/compatible/single_client/F"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/compatible/single_client/F/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${of_pyscript_dir}/oneflow/experimental/F"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/experimental/F/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${of_pyscript_dir}/oneflow/compatible/single_client/experimental/F"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/compatible/single_client/experimental/F/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${of_pyscript_dir}/oneflow/python_gen"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/python_gen/__init__.py"
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${of_pyscript_dir}/oneflow/compatible/single_client/python_gen"
-    COMMAND ${CMAKE_COMMAND} -E touch "${of_pyscript_dir}/oneflow/compatible/single_client/python_gen/__init__.py"
-    COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/tools/generate_pip_version.py ${gen_pip_args} --src=${PROJECT_SOURCE_DIR}
-    COMMAND ${Python_EXECUTABLE} "${PROJECT_SOURCE_DIR}/tools/generate_oneflow_symbols_export_file.py"
-        "${PROJECT_SOURCE_DIR}/oneflow/python" "${of_pyscript_dir}/oneflow/python_gen/__export_symbols__.py" "python"
-    COMMAND ${Python_EXECUTABLE} "${PROJECT_SOURCE_DIR}/tools/generate_oneflow_symbols_export_file.py"
-        "${of_pyscript_dir}/oneflow/compatible" "${of_pyscript_dir}/oneflow/compatible/single_client/python_gen/__export_symbols__.py" "compatible")
+    COMMAND ${CMAKE_COMMAND} -E touch "${of_proto_python_dir}/oneflow/core/__init__.py"
+    COMMAND ${CMAKE_COMMAND} -E create_symlink "${of_proto_python_dir}/oneflow/core" "${ONEFLOW_PYTHON_DIR}/oneflow/core"
+    COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/tools/generate_pip_version.py ${gen_pip_args} --src=${PROJECT_SOURCE_DIR} --out=${ONEFLOW_PYTHON_DIR}/oneflow/version.py
+)
 
 # source this file to add oneflow in PYTHONPATH
-file(WRITE "${PROJECT_BINARY_DIR}/source.sh" "export PYTHONPATH=${of_pyscript_dir}:$PYTHONPATH")
+file(WRITE "${PROJECT_BINARY_DIR}/source.sh" "export PYTHONPATH=${ONEFLOW_PYTHON_DIR}:$PYTHONPATH")
 
 add_dependencies(of_pyscript_copy of_protoobj)
-add_custom_target(generate_api ALL
-  COMMAND rm -rf ${of_pyscript_dir}/oneflow/generated
-  COMMAND export PYTHONPATH=${of_pyscript_dir}:$ENV{PYTHONPATH} && ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/tools/generate_oneflow_api.py --root_path=${of_pyscript_dir}/oneflow)
-add_dependencies(generate_api of_pyscript_copy)
-add_dependencies(generate_api oneflow_internal)
 
 file(RELATIVE_PATH PROJECT_BINARY_DIR_RELATIVE ${PROJECT_SOURCE_DIR} ${PROJECT_BINARY_DIR})
 
@@ -373,10 +335,10 @@ if(BUILD_TESTING)
 endif()
 
 # build include
-set(ONEFLOW_INCLUDE_DIR "${PROJECT_BINARY_DIR}/python_scripts/oneflow/include")
+set(ONEFLOW_INCLUDE_DIR "${ONEFLOW_PYTHON_DIR}/oneflow/include")
 add_custom_target(of_include_copy
   COMMAND ${CMAKE_COMMAND} -E remove_directory "${ONEFLOW_INCLUDE_DIR}" && ${CMAKE_COMMAND} -E make_directory "${ONEFLOW_INCLUDE_DIR}")
-add_dependencies(of_include_copy generate_api)
+add_dependencies(of_include_copy oneflow_internal)
 foreach(of_include_src_dir ${ONEFLOW_INCLUDE_SRC_DIRS})
   set(oneflow_all_include_file)
   file(GLOB_RECURSE oneflow_all_include_file "${of_include_src_dir}/*.*")
diff --git a/docker/package/manylinux/build_wheel.py b/docker/package/manylinux/build_wheel.py
index 453e8cd1f80a610bb05abd7352a8da79a2088785..32be2d7ad92e2dc1e59ba5a7f80331fc2bd2850a 100644
--- a/docker/package/manylinux/build_wheel.py
+++ b/docker/package/manylinux/build_wheel.py
@@ -88,6 +88,7 @@ def common_cmake_args(cache_dir=None, extra_oneflow_cmake_args=None):
 
 
 def get_build_dir_arg(cache_dir, oneflow_src_dir):
+    return ""
     build_dir_real = os.path.join(cache_dir, "build")
     build_dir_mount = os.path.join(oneflow_src_dir, "build")
     return f"-v {build_dir_real}:{build_dir_mount}"
@@ -138,6 +139,7 @@ def get_common_docker_args(
     current_dir=None,
     house_dir=None,
     use_system_proxy=True,
+    inplace=False,
 ):
     root = Path(cache_dir)
     child = Path(current_dir)
@@ -150,7 +152,10 @@ def get_common_docker_args(
         house_dir_arg = f"-v {house_dir}:{house_dir}"
     build_dir_arg = get_build_dir_arg(cache_dir, oneflow_src_dir)
     proxy_env_arg = get_proxy_env_args() if use_system_proxy else ""
-    return f"-v {oneflow_src_dir}:{oneflow_src_dir} {proxy_env_arg} {pwd_arg} {house_dir_arg} {cache_dir_arg} {build_dir_arg} -w {current_dir} --shm-size=8g"
+    inplace_attr = ""
+    if inplace == False:
+        inplace_attr = ":ro"
+    return f"-v {oneflow_src_dir}:{oneflow_src_dir}{inplace_attr} {proxy_env_arg} {pwd_arg} {house_dir_arg} {cache_dir_arg} {build_dir_arg} -w {current_dir} --shm-size=8g"
 
 
 def build_third_party(
@@ -221,9 +226,23 @@ def build_oneflow(
     use_system_proxy,
     enter_bash,
     skip_audit,
+    inplace,
 ):
     oneflow_build_dir = os.path.join(cache_dir, "build-oneflow")
     python_bin = get_python_bin(python_version)
+    oneflow_python_dir = os.path.join(oneflow_src_dir, "python")
+    inplace_arg = ""
+    oneflow_python_dir_cmd = ""
+    if inplace == False:
+        oneflow_python_dir = "/tmp/oneflow_python"
+        inplace_arg = f"-DONEFLOW_PYTHON_DIR={oneflow_python_dir}"
+        oneflow_python_dir_cmd = f"""
+        cp -r {oneflow_src_dir}/python {oneflow_python_dir}
+        cd {oneflow_python_dir}
+        git init
+        git clean -fXd
+        cd -
+        """
     cmake_cmd = " ".join(
         [
             "cmake",
@@ -235,6 +254,7 @@ def build_oneflow(
             "-DCMAKE_EXPORT_COMPILE_COMMANDS=1",
             f"-DPython3_EXECUTABLE={python_bin}",
             oneflow_src_dir,
+            inplace_arg,
         ]
     )
     common_docker_args = get_common_docker_args(
@@ -243,6 +263,7 @@ def build_oneflow(
         current_dir=oneflow_build_dir,
         house_dir=house_dir,
         use_system_proxy=use_system_proxy,
+        inplace=inplace,
     )
     docker_cmd = (
         f"docker run --network=host --rm {common_docker_args} {extra_docker_args}"
@@ -254,6 +275,8 @@ export LD_LIBRARY_PATH=/opt/intel/lib/intel64_lin:/opt/intel/mkl/lib/intel64:$LD
 export LD_LIBRARY_PATH=/opt/intel/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH
 export ONEFLOW_SRC_DIR={oneflow_src_dir}
+export ONEFLOW_PYTHON_DIR={oneflow_python_dir}
+{oneflow_python_dir_cmd}
 export ONEFLOW_CMAKE_CMD="{cmake_cmd}"
 """
     if enter_bash:
@@ -267,10 +290,9 @@ cmake --build . -j `nproc`
         pass
     else:
         bash_cmd += f"""
-rm -rf {oneflow_build_dir}/python_scripts/*.egg-info
-cd {oneflow_src_dir}
-rm -rf build/*
-{python_bin} setup.py bdist_wheel -d /tmp/tmp_wheel --build_dir {oneflow_build_dir} --package_name {package_name}
+cd {oneflow_python_dir}
+{python_bin} setup.py bdist_wheel -d /tmp/tmp_wheel --package_name {package_name}
+cd -
 """
     if skip_audit:
         bash_cmd += f"""
@@ -358,6 +380,7 @@ if __name__ == "__main__":
     )
     parser.add_argument("--cpu", default=False, action="store_true", required=False)
     parser.add_argument("--bash", default=False, action="store_true", required=False)
+    parser.add_argument("--inplace", default=False, action="store_true", required=False)
     parser.add_argument("--retry", default=0, type=int)
     args = parser.parse_args()
     if args.skip_img:
@@ -508,6 +531,7 @@ gcc --version
                     args.use_system_proxy,
                     args.bash,
                     args.skip_audit,
+                    args.inplace,
                 )
 
         try:
diff --git a/docs/source/image.rst b/docs/source/image.rst
index 83a82d47be8067cca3a1dd35fa2c81a98aebf994..6cb5ec8cad966608f19d4ef1140d471d8166eaf6 100644
--- a/docs/source/image.rst
+++ b/docs/source/image.rst
@@ -4,13 +4,8 @@ Image operations for neural networks
 --------------------------------------
 .. currentmodule:: oneflow.nn.image
 .. automodule:: oneflow.nn.image
-    :members: ImageBatchAlign, 
-        ImageDecode, 
-        ImageFlip, 
-        ImageNormalize, 
-        ImageResize, 
-        Resize, 
-        batch_align, 
-        decode, 
-        flip, 
-        normalize
\ No newline at end of file
+    :members: Resize,
+        batch_align,
+        decode,
+        flip,
+        normalize
diff --git a/docs/source/nn.rst b/docs/source/nn.rst
index e9517b5269a301b43624586df4c5acdd2c3c14d6..1eaa272a5833addc0555e72863950b20e9b2576d 100644
--- a/docs/source/nn.rst
+++ b/docs/source/nn.rst
@@ -4,76 +4,75 @@ Operators for neural networks
 ----------------------------------
 .. currentmodule:: oneflow.nn
 .. automodule:: oneflow.nn
-    :members: AdaptiveAvgPool1d, 
-        AdaptiveAvgPool2d, 
-        AdaptiveAvgPool3d, 
-        AvgPool1d, 
-        AvgPool2d, 
-        AvgPool3d, 
-        BCELoss, 
-        BCEWithLogitsLoss, 
-        BatchNorm1d, 
-        BatchNorm2d, 
-        COCOReader, 
-        CTCLoss, 
-        CoinFlip,  
-        ConstantPad1d, 
-        ConstantPad2d, 
-        ConstantPad3d, 
-        Conv1d, 
-        Conv2d, 
-        ConvTranspose2d, 
-        CropMirrorNormalize, 
-        CrossEntropyLoss, 
-        Dropout, 
-        ELU, 
-        Embedding, 
-        Flatten, 
-        GELU, 
-        GroupNorm, 
-        Hardsigmoid, 
-        Hardswish, 
-        Hardtanh, 
-        Identity, 
-        InstanceNorm1d, 
-        InstanceNorm2d, 
-        InstanceNorm3d, 
-        KLDivLoss, 
-        L1Loss, 
-        LayerNorm, 
-        LeakyReLU, 
-        Linear, 
-        LogSigmoid, 
-        LogSoftmax, 
-        MSELoss, 
-        MarginRankingLoss, 
-        MaxPool1d, 
-        MaxPool2d, 
-        MaxPool3d, 
-        Mish, 
-        NLLLoss, 
-        OFRecordImageDecoder, 
-        OFRecordImageDecoderRandomCrop, 
-        OfrecordRawDecoder, 
-        OfrecordReader, 
-        PReLU, 
-        Parameter, 
-        ParameterDict, 
-        ParameterList, 
-        PixelShuffle, 
-        PixelShufflev2, 
-        ReLU, 
-        ReLU6, 
-        ReflectionPad2d, 
-        ReplicationPad2d, 
-        Sequential, 
-        Sigmoid, 
-        SmoothL1Loss, 
-        Softmax, 
-        Softplus, 
-        Tanh, 
-        Upsample, 
-        UpsamplingBilinear2d, 
-        UpsamplingNearest2d, 
-        ZeroPad2d, 
+    :members: AdaptiveAvgPool1d,
+        AdaptiveAvgPool2d,
+        AdaptiveAvgPool3d,
+        AvgPool1d,
+        AvgPool2d,
+        AvgPool3d,
+        BCELoss,
+        BCEWithLogitsLoss,
+        BatchNorm1d,
+        BatchNorm2d,
+        COCOReader,
+        CTCLoss,
+        CoinFlip,
+        ConstantPad1d,
+        ConstantPad2d,
+        ConstantPad3d,
+        Conv1d,
+        Conv2d,
+        ConvTranspose2d,
+        CropMirrorNormalize,
+        CrossEntropyLoss,
+        Dropout,
+        ELU,
+        Embedding,
+        Flatten,
+        GELU,
+        GroupNorm,
+        Hardsigmoid,
+        Hardswish,
+        Hardtanh,
+        Identity,
+        InstanceNorm1d,
+        InstanceNorm2d,
+        InstanceNorm3d,
+        KLDivLoss,
+        L1Loss,
+        LayerNorm,
+        LeakyReLU,
+        Linear,
+        LogSigmoid,
+        LogSoftmax,
+        MSELoss,
+        MarginRankingLoss,
+        MaxPool1d,
+        MaxPool2d,
+        MaxPool3d,
+        Mish,
+        NLLLoss,
+        OFRecordImageDecoder,
+        OFRecordImageDecoderRandomCrop,
+        OfrecordRawDecoder,
+        OfrecordReader,
+        PReLU,
+        Parameter,
+        ParameterDict,
+        ParameterList,
+        PixelShuffle,
+        ReLU,
+        ReLU6,
+        ReflectionPad2d,
+        ReplicationPad2d,
+        Sequential,
+        Sigmoid,
+        SmoothL1Loss,
+        Softmax,
+        Softplus,
+        Tanh,
+        Upsample,
+        UpsamplingBilinear2d,
+        UpsamplingNearest2d,
+        ZeroPad2d,
         ctc_greedy_decoder
diff --git a/docs/source/optim.rst b/docs/source/optim.rst
index b52c473db24633578058e8f095911e5a30465952..a1d598f334a0a97dd20ae1d0d982be88fb47fd9e 100644
--- a/docs/source/optim.rst
+++ b/docs/source/optim.rst
@@ -4,15 +4,14 @@ Optimizers
 ----------------------------------
 .. currentmodule:: oneflow.optim
 .. automodule:: oneflow.optim
-    :members: Adam, 
-        AdamW, 
-        Optimizer, 
-        RMSprop, 
-        SGD, 
+    :members: Adam,
+        AdamW,
+        Optimizer,
+        RMSprop,
+        SGD,
         lr_scheduler
 
 .. automodule:: oneflow.optim.lr_scheduler
-    :members: CosineAnnealingLR, 
-            LambdaLR, 
-            LrScheduler, 
+    :members: CosineAnnealingLR,
+            LambdaLR,
             StepLR
diff --git a/oneflow/__main__.py b/oneflow/__main__.py
index 7b575523a1d024c935cd68e78c15a5f9897416c8..5ecd35d64fee4e6bea9c5e3be092b47b350906a3 100644
--- a/oneflow/__main__.py
+++ b/oneflow/__main__.py
@@ -45,6 +45,7 @@ def main():
             StartWorker(f.read())
     if args.doctor:
         import oneflow
+        import oneflow.sysconfig
 
         print("path:", oneflow.__path__)
         print("version:", oneflow.__version__)
diff --git a/oneflow/api/python/flags.cpp b/oneflow/api/python/flags.cpp
index 32e53492e8d373cb60cf001d74b600469b57e9c1..c0b66062c551281771c1253f921a7c3e8d1669b9 100644
--- a/oneflow/api/python/flags.cpp
+++ b/oneflow/api/python/flags.cpp
@@ -42,6 +42,14 @@ ONEFLOW_API_PYBIND11_MODULE("flags", m) {
 #endif  // WITH_XLA
   });
 
+  m.def("with_rdma", []() {
+#ifdef WITH_RDMA
+    return true;
+#else
+    return false;
+#endif  // WITH_RDMA
+  });
+
   m.def("has_rpc_backend_grpc", []() {
 #ifdef RPC_BACKEND_GRPC
     return true;
diff --git a/oneflow/compatible_single_client_python/experimental/interface_op_read_and_write.py b/oneflow/compatible_single_client_python/experimental/interface_op_read_and_write.py
index bf0fb08b03d1cc304a615cab9fa0a50ea375722c..0f9b0ba2629a2a99c018a169e4508c18abff5ed0 100644
--- a/oneflow/compatible_single_client_python/experimental/interface_op_read_and_write.py
+++ b/oneflow/compatible_single_client_python/experimental/interface_op_read_and_write.py
@@ -31,7 +31,6 @@ from oneflow.compatible.single_client.python.framework import (
 )
 from oneflow.compatible.single_client.python.framework import runtime_mode as rt_mode
 from oneflow.compatible.single_client.python.oneflow_export import oneflow_export
-from oneflow.compatible.single_client.python.eager import op_executor as op_executor
 from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
 from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
 from oneflow._oneflow_internal.oneflow.core.common import shape as shape_proto_cfg
diff --git a/oneflow/compatible_single_client_python/experimental/enable_typing_check.py b/oneflow/compatible_single_client_python/experimental/typing_check.py
similarity index 100%
rename from oneflow/compatible_single_client_python/experimental/enable_typing_check.py
rename to oneflow/compatible_single_client_python/experimental/typing_check.py
diff --git a/oneflow/compatible_single_client_python/framework/compile_context.py b/oneflow/compatible_single_client_python/framework/compile_context.py
index cb7caf0fef704a863665d52675591c2050ab9fe5..9ae7e89d55ff828e5e57f49b8103f51ea0b942bd 100644
--- a/oneflow/compatible_single_client_python/framework/compile_context.py
+++ b/oneflow/compatible_single_client_python/framework/compile_context.py
@@ -18,9 +18,6 @@ from __future__ import absolute_import
 from contextlib import contextmanager
 
 from oneflow.compatible import single_client as flow
-from oneflow.compatible.single_client.python.experimental import (
-    name_scope as name_scope,
-)
 from oneflow.compatible.single_client.python.framework import c_api_util as c_api_util
 from oneflow.compatible.single_client.python.framework import (
     distribute_context as distribute_ctx,
@@ -33,9 +30,6 @@ from oneflow.compatible.single_client.python.framework import (
 )
 from oneflow.compatible.single_client.python.framework import hob as hob
 from oneflow.compatible.single_client.python.lib.core import enable_if as enable_if
-from oneflow.compatible.single_client.python.experimental import (
-    name_scope as name_scope,
-)
 import oneflow._oneflow_internal
 
 
diff --git a/oneflow/compatible_single_client_python/framework/interpret_util.py b/oneflow/compatible_single_client_python/framework/interpret_util.py
index f17035edb1de9ed998ea46f5e913c9fb1fa27d73..08e36a7fa633158efc7f4d66e7795546480e9397 100644
--- a/oneflow/compatible_single_client_python/framework/interpret_util.py
+++ b/oneflow/compatible_single_client_python/framework/interpret_util.py
@@ -20,7 +20,6 @@ from oneflow.compatible.single_client.python.framework import (
 )
 from oneflow.compatible.single_client.python.framework import hob as hob
 from oneflow.compatible.single_client.python.lib.core import enable_if as enable_if
-from oneflow.compatible.single_client.python.eager import op_executor as op_executor
 from oneflow.compatible.single_client.python.eager import gradient_util as gradient_util
 from oneflow.compatible import single_client as flow
 import oneflow._oneflow_internal
@@ -66,6 +65,8 @@ def LazyOpKernelInfer(add_and_infer, op_conf, opkernel_object):
 def EagerForward(add_and_infer, op_conf, scope_symbol=None):
     op_attribute = add_and_infer(op_conf, scope_symbol)
     parallel_conf = scope_symbol.device_parallel_desc_symbol.parallel_conf
+    from oneflow.compatible.single_client.python.eager import op_executor as op_executor
+
     op_executor.Interpret(op_attribute, parallel_conf, blob_register)
     bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
     gradient_util.TrySetBackwardUsedBlobObject(
@@ -77,6 +78,8 @@ def EagerForward(add_and_infer, op_conf, scope_symbol=None):
 @enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
 def EagerOpKernelForward(add_and_infer, op_conf, opkernel_object):
     op_attribute = add_and_infer(op_conf, opkernel_object.scope_symbol)
+    from oneflow.compatible.single_client.python.eager import op_executor as op_executor
+
     op_executor.OpKernelCall(opkernel_object, op_attribute, blob_register)
     bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
     gradient_util.TrySetBackwardUsedBlobObject(
diff --git a/oneflow/compatible_single_client_python/summary/summary_hparams.py b/oneflow/compatible_single_client_python/summary/summary_hparams.py
index 0f4f3117ffb2fab332d0a918f3432ca59340c52e..91a901fdd30614eec541a250240f3f7953e1d448 100644
--- a/oneflow/compatible_single_client_python/summary/summary_hparams.py
+++ b/oneflow/compatible_single_client_python/summary/summary_hparams.py
@@ -168,7 +168,7 @@ def _get_value(value):
         return value
 
 
-@oneflow_export("summary.Hparam")
+@oneflow_export("summary.HParam")
 class HParam(object):
     r"""The class of Hparam
 
diff --git a/oneflow/compatible_single_client_python/test/ops/test_bias_add.py b/oneflow/compatible_single_client_python/test/ops/test_bias_add.py
index 35a936ff82ca42ea5b0e9fdbc60d30ff502ff5d5..22636d39d204c8cb124d38b47495d656fb556f57 100644
--- a/oneflow/compatible_single_client_python/test/ops/test_bias_add.py
+++ b/oneflow/compatible_single_client_python/test/ops/test_bias_add.py
@@ -100,10 +100,10 @@ def CompareBiasAddWithTensorFlow(
     op_args=None,
     input_minval=-10,
     input_maxval=10,
-    y_rtol=1e-5,
-    y_atol=1e-5,
-    x_diff_rtol=1e-5,
-    x_diff_atol=1e-5,
+    y_rtol=1e-3,
+    y_atol=1e-3,
+    x_diff_rtol=1e-2,
+    x_diff_atol=1e-2,
 ):
     assert device_type in ["gpu", "cpu"]
     if op_args is None:
diff --git a/oneflow/compatible_single_client_python/test/ops/test_global_function_input_output.py b/oneflow/compatible_single_client_python/test/ops/test_global_function_input_output.py
index 2b1f488275515d19127aa4bd7ba444aad7a6093d..2658bda4939b720be7be01956cdeed8bb75d2537 100644
--- a/oneflow/compatible_single_client_python/test/ops/test_global_function_input_output.py
+++ b/oneflow/compatible_single_client_python/test/ops/test_global_function_input_output.py
@@ -98,6 +98,7 @@ def _test_input_ndarray_not_contiguous(test_case, shape):
 #     # ret = foo_job(input).get()
 #     # test_case.assertTrue(np.allclose(input, ret.numpy()))
 @flow.unittest.skip_unless_1n1d()
+@unittest.skipIf(os.getenv("ONEFLOW_DRY_RUN"), "can't run in dry run")
 class TestGlobalFunctionInputOutput(flow.unittest.TestCase):
     def test_lazy_input_output(test_case):
         flow.clear_default_session()
diff --git a/oneflow/compatible_single_client_python/test/ops/test_quantization_aware_training.py b/oneflow/compatible_single_client_python/test/ops/test_quantization_aware_training.py
index 373fbb1084962dabf6f3396c1ae478029a325001..7bd7947642e0161c43aefee4a8ce3b3b657e4a1a 100644
--- a/oneflow/compatible_single_client_python/test/ops/test_quantization_aware_training.py
+++ b/oneflow/compatible_single_client_python/test/ops/test_quantization_aware_training.py
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+import os
 import unittest
 from collections import OrderedDict
 
@@ -52,6 +53,7 @@ def _test(test_case, per_channel, symmetric, target_backend, build_backbone_fn):
     res_qat = run_with_func_config(build_backbone_fn, qat_func_config)
 
 
+@unittest.skipIf(os.getenv("ONEFLOW_DRY_RUN"), "can't run in dry run")
 class TestQAT(flow.unittest.TestCase):
     def test_qat(test_case):
         def build_conv_with_bias(x):
diff --git a/oneflow/init.py b/oneflow/init.py
index 6a769bd5c9bd0c1ac059a95440d625b495836273..df5f28bc8b58ddb7b9f7e7cb54b8181618b2fbfe 100644
--- a/oneflow/init.py
+++ b/oneflow/init.py
@@ -15,6 +15,7 @@ limitations under the License.
 """
 # __init__.py, rename to avoid being added to PYTHONPATH
 from __future__ import absolute_import
+import collections
 
 import oneflow._oneflow_internal
 
@@ -49,6 +50,22 @@ from oneflow.python.version import __version__
 from oneflow.core.job.job_set_pb2 import ConfigProto
 from oneflow.core.job.job_conf_pb2 import JobConfigProto
 
+_DEPRECATED = set()
+
+
+def oneflow_deprecate(*api_names, **kwargs):
+    def Decorator(func_or_class):
+        _DEPRECATED.add(func_or_class)
+        return func_or_class
+
+    return Decorator
+
+
+def is_deprecated(func_or_class):
+    return (
+        isinstance(func_or_class, collections.Hashable) and func_or_class in _DEPRECATED
+    )
+
 
 import oneflow.python.framework.register_python_callback
 
@@ -104,9 +121,9 @@ def _SyncOnMasterFn():
     def Sync():
         if not oneflow._oneflow_internal.IsEnvInited():
             return
-        if oneflow.python.framework.distribute.is_multi_client():
+        if oneflow.framework.distribute.is_multi_client():
             oneflow._oneflow_internal.eager.multi_client.Sync()
-        elif oneflow.python.framework.distribute.get_rank() == 0:
+        elif oneflow.framework.distribute.get_rank() == 0:
             oneflow._oneflow_internal.eager.single_client.Sync()
 
     return Sync
@@ -114,7 +131,7 @@ def _SyncOnMasterFn():
 
 atexit.register(oneflow._oneflow_internal.SetShuttingDown)
 atexit.register(oneflow._oneflow_internal.DestroyEnv)
-atexit.register(oneflow.python.framework.session_context.TryCloseDefaultSession)
+atexit.register(oneflow.framework.session_context.TryCloseDefaultSession)
 # Global<ResourceDesc, ForSession>::Get(), used by vm in background thread,
 # will be set to nullptr by TryCloseDefaultSession,
 # so sync vm in advance to avoid data race
@@ -122,7 +139,6 @@ atexit.register(_SyncOnMasterFn)
 
 del atexit
 
-del absolute_import
 del oneflow
 
 import oneflow.python.framework.docstr as docstr
@@ -131,3 +147,9 @@ from oneflow.python.framework.docstr.utils import register_docstr
 register_docstr()
 del register_docstr
 del docstr
+from . import linalg
+from . import autograd
+from . import optim
+import oneflow.nn.image
+import oneflow.nn.modules.permute
+import oneflow.tmp
diff --git a/oneflow/python/experimental/interface_op_read_and_write.py b/oneflow/python/experimental/interface_op_read_and_write.py
index 82acacd94d741af56e9002049c30b837d5cd9b8a..de9af258587620017497a7c4260b528577fb6281 100644
--- a/oneflow/python/experimental/interface_op_read_and_write.py
+++ b/oneflow/python/experimental/interface_op_read_and_write.py
@@ -23,7 +23,6 @@ import oneflow.python.framework.push_util as push_util
 import oneflow.python.framework.session_context as session_ctx
 import oneflow.python.framework.runtime_mode as rt_mode
 from oneflow.python.oneflow_export import oneflow_export
-import oneflow.python.eager.op_executor as op_executor
 import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
 import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
 import oneflow._oneflow_internal.oneflow.core.common.shape as shape_proto_cfg
diff --git a/oneflow/python/framework/compile_context.py b/oneflow/python/framework/compile_context.py
index ff2a3f365850879222241b683f8f184fd05bc32a..635f413fc9e7caf08999f24cb34eb21891cc4c7b 100644
--- a/oneflow/python/framework/compile_context.py
+++ b/oneflow/python/framework/compile_context.py
@@ -18,14 +18,12 @@ from __future__ import absolute_import
 from contextlib import contextmanager
 
 import oneflow
-import oneflow.python.experimental.name_scope as name_scope
 import oneflow.python.framework.c_api_util as c_api_util
 import oneflow.python.framework.distribute_context as distribute_ctx
 import oneflow.python.framework.placement_context as placement_context
 import oneflow.python.framework.session_context as session_ctx
 import oneflow.python.framework.hob as hob
 import oneflow.python.lib.core.enable_if as enable_if
-import oneflow.python.experimental.name_scope as name_scope
 import oneflow
 import oneflow._oneflow_internal
 
diff --git a/oneflow/python/framework/interpret_util.py b/oneflow/python/framework/interpret_util.py
index 1529e7959293603af75c0214a48f5bb6485045c3..498718a2b7f2562ca8aea84c01f51772f22e9ce7 100644
--- a/oneflow/python/framework/interpret_util.py
+++ b/oneflow/python/framework/interpret_util.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 import oneflow.python.framework.compile_context as compile_ctx
 import oneflow.python.framework.hob as hob
 import oneflow.python.lib.core.enable_if as enable_if
-import oneflow.python.eager.op_executor as op_executor
 import oneflow.python.eager.gradient_util as gradient_util
 import oneflow
 import oneflow._oneflow_internal
@@ -64,6 +63,8 @@ def LazyOpKernelInfer(add_and_infer, op_conf, opkernel_object):
 def EagerForward(add_and_infer, op_conf, scope_symbol=None):
     op_attribute = add_and_infer(op_conf, scope_symbol)
     parallel_conf = scope_symbol.device_parallel_desc_symbol.parallel_conf
+    import oneflow.python.eager.op_executor as op_executor
+
     op_executor.Interpret(op_attribute, parallel_conf, blob_register)
     bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
     gradient_util.TrySetBackwardUsedBlobObject(
@@ -75,6 +76,8 @@ def EagerForward(add_and_infer, op_conf, scope_symbol=None):
 @enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
 def EagerOpKernelForward(add_and_infer, op_conf, opkernel_object):
     op_attribute = add_and_infer(op_conf, opkernel_object.scope_symbol)
+    import oneflow.python.eager.op_executor as op_executor
+
     op_executor.OpKernelCall(opkernel_object, op_attribute, blob_register)
     bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
     gradient_util.TrySetBackwardUsedBlobObject(
diff --git a/oneflow/python/framework/unittest.py b/oneflow/python/framework/unittest.py
index 26bb73369f615d719a49ec1a35a69ba6676b9b7c..b5aa84b046c8c01fe4ab3f7567d3c6df9ee90f57 100644
--- a/oneflow/python/framework/unittest.py
+++ b/oneflow/python/framework/unittest.py
@@ -31,6 +31,7 @@ from oneflow.core.job.env_pb2 import EnvProto
 from oneflow.python.oneflow_export import oneflow_export
 from typing import Any, Dict, Callable
 import subprocess
+import oneflow.env
 
 
 class _ClearDefaultSession(object):
diff --git a/oneflow/python/nn/__init__.py b/oneflow/python/nn/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f6e5a557df25d8da77362474fe5f80eb8c234b97 100644
--- a/oneflow/python/nn/__init__.py
+++ b/oneflow/python/nn/__init__.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from . import functional
diff --git a/oneflow/python/nn/modules/padding.py b/oneflow/python/nn/modules/padding.py
index 542a29ed4ef73cad71662947322878b0b8cda49e..d5ab6b8b13dbe2dfcd3d1e2ac85b417b5864b5f6 100644
--- a/oneflow/python/nn/modules/padding.py
+++ b/oneflow/python/nn/modules/padding.py
@@ -25,7 +25,7 @@ class ReplicationPad2d(Module):
     r"""The interface is consistent with PyTorch.
     The documentation is referenced from:
     https://pytorch.org/docs/stable/generated/torch.nn.ReplicationPad2d.html?highlight=replicationpad2d#torch.nn.ReplicationPad2d
-    
+
     Pads the input tensor using the replication of the input boundary.
 
     Args:
@@ -148,7 +148,7 @@ class ReflectionPad2d(Module):
                   [ 5.,  4.,  3.,  4.,  5.,  4.,  3.],
                   [ 8.,  7.,  6.,  7.,  8.,  7.,  6.],
                   [ 5.,  4.,  3.,  4.,  5.,  4.,  3.]],
-        <BLANKLINE>         
+        <BLANKLINE>
                  [[14., 13., 12., 13., 14., 13., 12.],
                   [11., 10.,  9., 10., 11., 10.,  9.],
                   [14., 13., 12., 13., 14., 13., 12.],
@@ -211,7 +211,7 @@ class ConstantPad1d(Module):
 
     .. code-block:: python
 
-        >>> import oneflow.experimental as flow
+        >>> import oneflow as flow
         >>> import numpy as np
 
         >>> input = flow.tensor(np.arange(8).reshape(2,2,2).astype(np.float32))
@@ -254,13 +254,13 @@ class ConstantPad2d(Module):
     The documentation is referenced from:
     https://pytorch.org/docs/stable/generated/torch.nn.ConstantPad2d.html?highlight=constantpad2d#torch.nn.ConstantPad2d
 
-    This operator pads the input with constant value that user specifies. 
+    This operator pads the input with constant value that user specifies.
     User can set the amount of padding by setting the parameter `paddings`.
 
     Args:
         padding (int, tuple, list):  the size of the padding.
-            If is `int`, uses the same padding in all boundaries. 
-            If a 4-`tuple`, uses 
+            If is `int`, uses the same padding in all boundaries.
+            If a 4-`tuple`, uses
             (:math:`\mathrm{padding_{left}}`, :math:`\mathrm{padding_{right}}`, :math:`\mathrm{padding_{top}}`, :math:`\mathrm{padding_{bottom}}`)
 
         value (int, float): The constant value used for padding. Defaults to 0.
@@ -349,7 +349,7 @@ class ConstantPad3d(Module):
             (:math:`\text{padding_left}`, :math:`\text{padding_right}`,
             :math:`\text{padding_top}`, :math:`\text{padding_bottom}`,
             :math:`\text{padding_front}`, :math:`\text{padding_back}`)
-        
+
         value (int, float): The constant value used for padding. Defaults to 0.
 
     Shape:
diff --git a/oneflow/python/optim/__init__.py b/oneflow/python/optim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c35b85000785bb668107eaf58fe05797089485b
--- /dev/null
+++ b/oneflow/python/optim/__init__.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from . import lr_scheduler
diff --git a/oneflow/python/test/README.md b/oneflow/python/test/README.md
deleted file mode 100644
index 9ed34bd91c131476923f826789a52fe186799622..0000000000000000000000000000000000000000
--- a/oneflow/python/test/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# 娴嬭瘯宸ュ叿浣跨敤绠€浠�
-
-Created: Oct 4, 2020 10:52 AM
-
-瀵筼p鐨勬祴璇曚唬鐮佽繘琛屼簡鏇存柊锛屼富瑕佽В鍐崇殑闂锛�
-
-1. 鏃т唬鐮佸湪 python 鑷甫鐨� unittest 涓婂紩鍏ヤ簡涓€浜涜繕鏈夌偣澶嶆潅鐨勬娊璞★紝瀵艰嚧骞惰杩愯鍗曞厓娴嬭瘯寰堥毦鍋氬埌
-2. 涓嶈兘闅忔剰杩愯鍗曚竴鑴氭湰锛屾瘮濡� `python3 oneflow/python/test/ops/test_add.py` 杩欐牱
-3. 瀵逛簬鍚姩娴嬭瘯鐨勯厤缃俊鎭紝閮借闈犲懡浠よ浼犲叆锛屽 CI 涓嶅弸濂�
-
-鏂扮殑缂栧啓瑙勮寖锛�
-
-```cpp
-@flow.unittest.skip_unless_1n1d()
-class TestAdd(flow.unittest.TestCase):
-    def test_naive(test_case):
-        ....
-
-    def test_broadcast(test_case):
-        ....
-
-if __name__ == "__main__":
-    unittest.main()
-```
-
-- 蹇呴』鎶奰test__***` 鍑芥暟鍐欏湪涓€涓户鎵� `flow.unittest.TestCase` 鐨勭被閲岄潰
-- 蹇呴』鍔犱竴涓� `if __name__ == "__main__":` 锛岄噷闈㈣皟鐢� `unittest.main()`
-- 蹇呴』鍔犱笂 skip decorator锛屾瘮濡� `@flow.unittest.skip_unless_1n1d()` 鏍囪杩欎釜娴嬭瘯鐢ㄤ緥鍙湪1 node 1 device 鐨勬儏鍐典笅鎵嶈兘杩愯銆傛敞鎰忥細杩欓噷鐨� device 涓嶄粎瑕佽€冭檻鍒� oneflow 鐢ㄤ簡鍑犱釜 gpu锛岃繕瑕佽€冭檻鍒拌繖涓剼鏈噷闈� tensorflow/pytorch 鐢ㄥ埌浜嗗嚑涓� gpu
-- skip decorator 鍙互鏀惧湪 class 澶翠笂涔熷彲浠ユ斁鍦� method 澶翠笂锛屾斁鍦� class 澶翠笂鐨勮瘽锛屼笉婊¤冻鏉′欢鏁翠釜 class 鍐呴儴鎵€鏈� test method 閮戒細璺宠繃
-- 鍦� python unit test 鐨勮鑼冧笂娌℃湁寮曞叆棰濆鐨勬娊璞★紝浜嗚В鏇村锛歔https://docs.python.org/3/library/unittest.html](https://docs.python.org/3/library/unittest.html)
-
-濡備綍杩愯锛�
-
-- 鏁翠綋杩愯锛岃繘鍏� `oneflow/python/test/ops`鐩綍锛岃繍琛宍python3 -m unittest`
-
-    ```cpp
-    oneflow/python/test/ops
-    export ONEFLOW_TEST_DEVICE_NUM=1
-    python3 -m unittest --failfast --verbose
-    ```
-
-    鎴栬€咃細
-
-    ```cpp
-    python3 -m unittest discover oneflow/python/test/ops
-    ```
-
-    鏇村鐢ㄦ硶璇峰弬鑰� [https://docs.python.org/3/library/unittest.html](https://docs.python.org/3/library/unittest.html)
-
-- 閫氳繃璁剧疆鐜鍙橀噺 `ONEFLOW_TEST_DEVICE_NUM` 杩囨护瑕佽繍琛屽嚑鍗＄殑鑴氭湰锛屽鏋滄病鏈夌粰锛岄粯璁ゅ氨鏄�1
-- 澶氭満鑴氭湰闇€瑕佽缃� `ONEFLOW_TEST_NODE_LIST` 鍜宍ONEFLOW_TEST_MASTER_PORT`鐜鍙橀噺鏉ユ寚瀹氬鏈虹殑 ip 鍦板潃鍜� control port
-- 杩愯鍗曚竴鑴氭湰锛屽彲浠ョ洿鎺ョ敤 python3 浜岃繘鍒惰繍琛屼竴涓枃浠讹紝鎺ュ彈 python unitest 鐨勬墍鏈夊懡浠よ鍙傛暟锛屽 `--failfast` , `--verbose`
-
-    ```cpp
-    python3 oneflow/python/test/ops/test_add.py --verbose
-    ```
diff --git a/oneflow/python/test/graph/test_input_op_expr.py b/oneflow/python/test/graph/test_input_op_expr.py
index 4cb52209f03f9f3dc485ac40be66118e44730bfb..448913d28d59dc5939315b2acce4ad7cf13bfe05 100644
--- a/oneflow/python/test/graph/test_input_op_expr.py
+++ b/oneflow/python/test/graph/test_input_op_expr.py
@@ -30,9 +30,7 @@ import oneflow.python.framework.c_api_util as c_api_util
 class TestFeedInputTensor(unittest.TestCase):
     def test_feed_input_tensor(test_case):
         test_case.assertTrue(oneflow.distributed.is_multi_client())
-        test_case.assertTrue(
-            oneflow.python.framework.env_util.HasAllMultiClientEnvVars()
-        )
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
 
         x = flow.Tensor(1, 1, 10, 10)
         flow.nn.init.uniform_(x, a=-1.0, b=1.0)
diff --git a/oneflow/python/test/graph/test_output_op_expr.py b/oneflow/python/test/graph/test_output_op_expr.py
index 84780bc13c60dfc39f49bec62a0f1102304f0119..5a6e9978d5e94394b95945023bac1c1c48453200 100644
--- a/oneflow/python/test/graph/test_output_op_expr.py
+++ b/oneflow/python/test/graph/test_output_op_expr.py
@@ -30,9 +30,7 @@ import oneflow.python.framework.c_api_util as c_api_util
 class TestFetchOutputTensor(unittest.TestCase):
     def test_fetch_output_tensor(test_case):
         test_case.assertTrue(oneflow.distributed.is_multi_client())
-        test_case.assertTrue(
-            oneflow.python.framework.env_util.HasAllMultiClientEnvVars()
-        )
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
 
         x = flow.Tensor(1, 1, 10, 10)
         flow.nn.init.uniform_(x, a=-1.0, b=1.0)
diff --git a/oneflow/python/test/graph/test_user_op_expr.py b/oneflow/python/test/graph/test_user_op_expr.py
index ac06a750e5a249c1af3e64f725063b3d4fff4c77..5162522c3b6c4a5d66c457eadfdbebb021ca35da 100644
--- a/oneflow/python/test/graph/test_user_op_expr.py
+++ b/oneflow/python/test/graph/test_user_op_expr.py
@@ -30,9 +30,7 @@ import oneflow.python.framework.c_api_util as c_api_util
 class TestUserOpGraph(unittest.TestCase):
     def test_user_op_graph(test_case):
         test_case.assertTrue(oneflow.distributed.is_multi_client())
-        test_case.assertTrue(
-            oneflow.python.framework.env_util.HasAllMultiClientEnvVars()
-        )
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
 
         x0 = flow.Tensor(20, 30)
         weight0 = flow.Tensor(30, 50)
diff --git a/oneflow/python/test/graph/test_variable_op_expr.py b/oneflow/python/test/graph/test_variable_op_expr.py
index a5d3b0576587cab80817db7bf5b95d46aba4f104..3ac6ee221a4138754474dda70993934755fcb9a9 100644
--- a/oneflow/python/test/graph/test_variable_op_expr.py
+++ b/oneflow/python/test/graph/test_variable_op_expr.py
@@ -30,9 +30,7 @@ import oneflow.python.framework.c_api_util as c_api_util
 class TestFeedVariableTensor(unittest.TestCase):
     def test_feed_var_tensor(test_case):
         test_case.assertTrue(oneflow.distributed.is_multi_client())
-        test_case.assertTrue(
-            oneflow.python.framework.env_util.HasAllMultiClientEnvVars()
-        )
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
 
         x = flow.Tensor(1, 1, 10, 10)
         flow.nn.init.uniform_(x, a=-1.0, b=1.0)
diff --git a/oneflow/python/test/modules/test_conv.py b/oneflow/python/test/modules/test_conv.py
index 347f41f07cecdd95f87c9eda6f5effaaab7d62c8..4abc9c0040f4be2783a34f0df94e9c726f81fbf8 100644
--- a/oneflow/python/test/modules/test_conv.py
+++ b/oneflow/python/test/modules/test_conv.py
@@ -1187,7 +1187,7 @@ def _test_conv2d(
         conv.bias = flow.nn.Parameter(flow.Tensor(bias))
     conv.to(to_device)
     of_out = conv(x)
-    test_case.assertTrue(np.allclose(of_out.numpy(), output, rtol=1e-4, atol=1e-8))
+    test_case.assertTrue(np.allclose(of_out.numpy(), output, rtol=1e-3, atol=1e-7))
 
 
 def _test_conv2d_backward(
@@ -1565,7 +1565,7 @@ def _test_conv2d_large_out_channel(test_case, device):
             ]
         ]
     )
-    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-6, 1e-6))
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-5, 1e-6))
     output = output.sum()
     output.backward()
     np_grad = np.array(
diff --git a/oneflow/python/unittest/__init__.py b/oneflow/python/unittest/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b4534d2df3583dd24b55486877665d2c4f47ef1
--- /dev/null
+++ b/oneflow/python/unittest/__init__.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from . import env
diff --git a/oneflow/single_client_init.py b/oneflow/single_client_init.py
index 1d4860f07dbfa7f278b4a999af44254809ac110f..36c73da489eb9d1e69ab33e7c43979d39ea2a0f1 100644
--- a/oneflow/single_client_init.py
+++ b/oneflow/single_client_init.py
@@ -141,5 +141,3 @@ sys.exit = custom_exit
 
 del custom_exit
 del sys
-
-del absolute_import
diff --git a/python/.gitignore b/python/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e9f3e2c68fa37dfc5e6dc11b790c5007bf480931
--- /dev/null
+++ b/python/.gitignore
@@ -0,0 +1,139 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+/oneflow/include
+/oneflow/core
+/oneflow/compatible/single_client/core
+/oneflow/version.py
+lib.py
+*.ast.py
+unittest-log-*
+log
+output
diff --git a/python/oneflow/F/__init__.py b/python/oneflow/F/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/__init__.py b/python/oneflow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..381a87898e5b862d65162ad23fdd2b4d52b35555
--- /dev/null
+++ b/python/oneflow/__init__.py
@@ -0,0 +1,364 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import collections
+
+import oneflow._oneflow_internal
+
+oneflow._oneflow_internal.CheckAndClearRegistryFlag()
+Size = oneflow._oneflow_internal.Size
+device = oneflow._oneflow_internal.device
+placement = oneflow._oneflow_internal.placement
+no_grad = oneflow._oneflow_internal.autograd.no_grad
+locals()["dtype"] = oneflow._oneflow_internal.dtype
+locals()["char"] = oneflow._oneflow_internal.char
+locals()["float16"] = oneflow._oneflow_internal.float16
+locals()["half"] = oneflow._oneflow_internal.float16
+locals()["float32"] = oneflow._oneflow_internal.float32
+locals()["float"] = oneflow._oneflow_internal.float
+locals()["double"] = oneflow._oneflow_internal.double
+locals()["float64"] = oneflow._oneflow_internal.float64
+locals()["int8"] = oneflow._oneflow_internal.int8
+locals()["int"] = oneflow._oneflow_internal.int32
+locals()["int32"] = oneflow._oneflow_internal.int32
+locals()["int64"] = oneflow._oneflow_internal.int64
+locals()["long"] = oneflow._oneflow_internal.int64
+locals()["uint8"] = oneflow._oneflow_internal.uint8
+locals()["record"] = oneflow._oneflow_internal.record
+locals()["tensor_buffer"] = oneflow._oneflow_internal.tensor_buffer
+from oneflow.core.job.job_conf_pb2 import JobConfigProto
+from oneflow.core.job.job_set_pb2 import ConfigProto
+from oneflow.version import __version__
+
+_DEPRECATED = set()
+
+
+def oneflow_deprecate(*api_names, **kwargs):
+    def Decorator(func_or_class):
+        _DEPRECATED.add(func_or_class)
+        return func_or_class
+
+    return Decorator
+
+
+def is_deprecated(func_or_class):
+    return (
+        isinstance(func_or_class, collections.Hashable) and func_or_class in _DEPRECATED
+    )
+
+
+import atexit
+
+import oneflow.framework.c_api_util
+import oneflow.framework.register_class_method_util as register_class_method_util
+import oneflow.framework.register_python_callback
+
+INVALID_SPLIT_AXIS = oneflow._oneflow_internal.INVALID_SPLIT_AXIS
+register_class_method_util.RegisterMethod4Class()
+oneflow._oneflow_internal.RegisterGILForeignLockHelper()
+import oneflow.framework.env_util as env_util
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_ctx
+from oneflow.framework.multi_client_session import MultiClientSession
+from oneflow.framework.session_util import Session
+
+if not env_util.HasAllMultiClientEnvVars():
+    env_util.SetDefaultMultiClientEnvVars()
+oneflow._oneflow_internal.SetIsMultiClient(True)
+env_util.api_env_init()
+session_ctx.OpenDefaultSession(
+    MultiClientSession(oneflow._oneflow_internal.NewSessionId())
+)
+scope_util.InitScopeStack()
+oneflow._oneflow_internal.EnableEagerEnvironment(True)
+del env_util
+from oneflow.framework import python_callback, register_python_callback
+
+oneflow._oneflow_internal.RegisterGlobalForeignCallback(
+    python_callback.global_python_callback
+)
+del python_callback
+del register_python_callback
+from oneflow.framework import watcher
+
+oneflow._oneflow_internal.RegisterGlobalWatcher(watcher._global_watcher)
+del watcher
+
+
+def _SyncOnMasterFn():
+    import oneflow
+
+    def Sync():
+        if not oneflow._oneflow_internal.IsEnvInited():
+            return
+        if oneflow.framework.distribute.is_multi_client():
+            oneflow._oneflow_internal.eager.multi_client.Sync()
+        elif oneflow.framework.distribute.get_rank() == 0:
+            oneflow._oneflow_internal.eager.single_client.Sync()
+
+    return Sync
+
+
+atexit.register(oneflow._oneflow_internal.SetShuttingDown)
+atexit.register(oneflow._oneflow_internal.DestroyEnv)
+atexit.register(oneflow.framework.session_context.TryCloseDefaultSession)
+atexit.register(_SyncOnMasterFn)
+del atexit
+del oneflow
+import oneflow.framework.docstr as docstr
+from oneflow.framework.docstr.utils import register_docstr
+
+register_docstr()
+del register_docstr
+del docstr
+import oneflow.nn.image
+import oneflow.nn.modules.acosh
+import oneflow.nn.modules.activation
+import oneflow.nn.modules.argwhere
+import oneflow.nn.modules.atan2
+import oneflow.nn.modules.atanh
+import oneflow.nn.modules.bmm
+import oneflow.nn.modules.constant
+import oneflow.nn.modules.diag
+import oneflow.nn.modules.flip
+import oneflow.nn.modules.floor
+import oneflow.nn.modules.greater
+import oneflow.nn.modules.greater_equal
+import oneflow.nn.modules.in_top_k
+import oneflow.nn.modules.masked_select
+import oneflow.nn.modules.math_ops
+import oneflow.nn.modules.norm
+import oneflow.nn.modules.permute
+import oneflow.nn.modules.round
+import oneflow.nn.modules.sign
+import oneflow.nn.modules.sinh
+import oneflow.nn.modules.tan
+import oneflow.nn.modules.tensor_ops
+import oneflow.tmp
+from oneflow.advanced.distribute_ops import cast_to_current_logical_view
+from oneflow.deprecated.initializer_util import (
+    truncated_normal_initializer as truncated_normal,
+)
+from oneflow.experimental.namescope import deprecated_name_scope as name_scope
+from oneflow.framework.check_point_v2 import GetAllVariables as get_all_variables
+from oneflow.framework.check_point_v2 import Load as load
+from oneflow.framework.check_point_v2 import LoadVariables as load_variables
+from oneflow.framework.check_point_v2 import save
+from oneflow.framework.dtype import convert_oneflow_dtype_to_numpy_dtype, dtypes
+from oneflow.framework.env_util import (
+    api_enable_eager_execution as enable_eager_execution,
+)
+from oneflow.framework.env_util import api_get_current_machine_id as current_machine_id
+from oneflow.framework.env_util import api_get_current_resource as current_resource
+from oneflow.framework.function_desc import (
+    api_current_global_function_desc as current_global_function_desc,
+)
+from oneflow.framework.function_util import FunctionConfig
+from oneflow.framework.function_util import FunctionConfig as ExecutionConfig
+from oneflow.framework.function_util import FunctionConfig as function_config
+from oneflow.framework.function_util import api_oneflow_function as global_function
+from oneflow.framework.generator import create_generator as Generator
+from oneflow.framework.generator import default_generator, manual_seed
+from oneflow.framework.input_blob_def import DeprecatedFixedTensorDef as FixedTensorDef
+from oneflow.framework.input_blob_def import (
+    DeprecatedMirroredTensorDef as MirroredTensorDef,
+)
+from oneflow.framework.job_set_util import inter_job_reuse_mem_strategy
+from oneflow.framework.model import Model
+from oneflow.framework.ops import api_acc as acc
+from oneflow.framework.ops import (
+    api_hierarchical_parallel_cast as hierarchical_parallel_cast,
+)
+from oneflow.framework.ops import api_pack as pack
+from oneflow.framework.ops import api_parallel_cast as parallel_cast
+from oneflow.framework.ops import api_unpack as unpack
+from oneflow.framework.placement_util import (
+    deprecated_placement as device_prior_placement,
+)
+from oneflow.framework.placement_util import deprecated_placement as fixed_placement
+from oneflow.framework.scope_util import api_current_scope as current_scope
+from oneflow.framework.session_util import (
+    TmpInitEagerGlobalSession as InitEagerGlobalSession,
+)
+from oneflow.framework.session_util import (
+    api_clear_default_session as clear_default_session,
+)
+from oneflow.framework.session_util import (
+    api_eager_execution_enabled as eager_execution_enabled,
+)
+from oneflow.framework.session_util import (
+    api_find_or_create_module as find_or_create_module,
+)
+from oneflow.framework.session_util import (
+    api_sync_default_session as sync_default_session,
+)
+from oneflow.framework.tensor import Tensor
+from oneflow.framework.tensor import construct_tensor as tensor
+from oneflow.nn.modules.abs import abs_op as abs
+from oneflow.nn.modules.acos import acos_op as acos
+from oneflow.nn.modules.acosh import acosh_op as acosh
+from oneflow.nn.modules.acosh import arccosh_op as arccosh
+from oneflow.nn.modules.activation import gelu_op as gelu
+from oneflow.nn.modules.activation import mish_op as mish
+from oneflow.nn.modules.activation import sigmoid_op as sigmoid
+from oneflow.nn.modules.activation import softmax_op as softmax
+from oneflow.nn.modules.activation import tanh_op as tanh
+from oneflow.nn.modules.adaptive_pool import (
+    adaptive_avg_pool1d,
+    adaptive_avg_pool2d,
+    adaptive_avg_pool3d,
+)
+from oneflow.nn.modules.arange import arange_op as arange
+from oneflow.nn.modules.argmax import argmax_op as argmax
+from oneflow.nn.modules.argsort import argsort_op as argsort
+from oneflow.nn.modules.argwhere import argwhere_op as argwhere
+from oneflow.nn.modules.atan2 import atan2_op as atan2
+from oneflow.nn.modules.atanh import arctanh_op as arctanh
+from oneflow.nn.modules.atanh import atanh_op as atanh
+from oneflow.nn.modules.bmm import bmm_op as bmm
+from oneflow.nn.modules.broadcast_like import broadcast_like_op as broadcast_like
+from oneflow.nn.modules.cast import cast_op as cast
+from oneflow.nn.modules.chunk import chunk_op as chunk
+from oneflow.nn.modules.concat import concat_op as cat
+from oneflow.nn.modules.constant import ones_like_op as ones_like
+from oneflow.nn.modules.constant import ones_op as ones
+from oneflow.nn.modules.constant import zeros_like_op as zeros_like
+from oneflow.nn.modules.constant import zeros_op as zeros
+from oneflow.nn.modules.dataset import tensor_buffer_to_list_of_tensors
+from oneflow.nn.modules.diag import diag_op as diag
+from oneflow.nn.modules.eq import eq_op as eq
+from oneflow.nn.modules.eq import eq_op as equal
+from oneflow.nn.modules.exp import exp_op as exp
+from oneflow.nn.modules.expand import expand_op as expand
+from oneflow.nn.modules.flatten import _flow_flatten as flatten
+from oneflow.nn.modules.flip import flip_op as flip
+from oneflow.nn.modules.floor import floor_op as floor
+from oneflow.nn.modules.gather import gather_op as gather
+from oneflow.nn.modules.gather_nd import gather_nd_op as gather_nd
+from oneflow.nn.modules.greater import greater_op as gt
+from oneflow.nn.modules.greater_equal import greater_equal_op as ge
+from oneflow.nn.modules.in_top_k import in_top_k_op as in_top_k
+from oneflow.nn.modules.less import less_op as lt
+from oneflow.nn.modules.less_equal import less_equal_op as le
+from oneflow.nn.modules.log1p import log1p_op as log1p
+from oneflow.nn.modules.masked_fill import masked_fill_op as masked_fill
+from oneflow.nn.modules.masked_select import masked_select_op as masked_select
+from oneflow.nn.modules.math_ops import _add as add
+from oneflow.nn.modules.math_ops import _div as div
+from oneflow.nn.modules.math_ops import _mul as mul
+from oneflow.nn.modules.math_ops import _reciprocal as reciprocal
+from oneflow.nn.modules.math_ops import _sub as sub
+from oneflow.nn.modules.math_ops import addmm_op as addmm
+from oneflow.nn.modules.math_ops import arcsin_op as arcsin
+from oneflow.nn.modules.math_ops import arcsinh_op as arcsinh
+from oneflow.nn.modules.math_ops import arctan_op as arctan
+from oneflow.nn.modules.math_ops import asin_op as asin
+from oneflow.nn.modules.math_ops import asinh_op as asinh
+from oneflow.nn.modules.math_ops import atan_op as atan
+from oneflow.nn.modules.math_ops import ceil_op as ceil
+from oneflow.nn.modules.math_ops import clamp_op as clamp
+from oneflow.nn.modules.math_ops import clip_op as clip
+from oneflow.nn.modules.math_ops import cos_op as cos
+from oneflow.nn.modules.math_ops import cosh_op as cosh
+from oneflow.nn.modules.math_ops import erf_op as erf
+from oneflow.nn.modules.math_ops import erfc_op as erfc
+from oneflow.nn.modules.math_ops import expm1_op as expm1
+from oneflow.nn.modules.math_ops import fmod_op as fmod
+from oneflow.nn.modules.math_ops import log_op as log
+from oneflow.nn.modules.math_ops import pow_op as pow
+from oneflow.nn.modules.math_ops import rsqrt_op as rsqrt
+from oneflow.nn.modules.math_ops import sin_op as sin
+from oneflow.nn.modules.math_ops import sqrt_op as sqrt
+from oneflow.nn.modules.math_ops import square_op as square
+from oneflow.nn.modules.math_ops import std_op as std
+from oneflow.nn.modules.math_ops import topk_op as topk
+from oneflow.nn.modules.math_ops import variance_op as var
+from oneflow.nn.modules.matmul import matmul_op as matmul
+from oneflow.nn.modules.meshgrid import meshgrid_op as meshgrid
+from oneflow.nn.modules.ne import ne_op as ne
+from oneflow.nn.modules.ne import ne_op as not_equal
+from oneflow.nn.modules.negative import negative_op as neg
+from oneflow.nn.modules.negative import negative_op as negative
+from oneflow.nn.modules.random_ops import bernoulli
+from oneflow.nn.modules.reduce_ops import _max as max
+from oneflow.nn.modules.reduce_ops import _mean as mean
+from oneflow.nn.modules.reduce_ops import _min as min
+from oneflow.nn.modules.reduce_ops import _sum as sum
+from oneflow.nn.modules.repeat import repeat_op as repeat
+from oneflow.nn.modules.reshape import reshape_op as reshape
+from oneflow.nn.modules.reshape import view_op as view
+from oneflow.nn.modules.round import round_op as round
+from oneflow.nn.modules.scatter_nd import _scatter_nd_op as scatter_nd
+from oneflow.nn.modules.sign import sign_op as sign
+from oneflow.nn.modules.sinh import sinh_op as sinh
+from oneflow.nn.modules.slice import slice_op as slice
+from oneflow.nn.modules.slice import slice_update_op as slice_update
+from oneflow.nn.modules.softplus import softplus_op as softplus
+from oneflow.nn.modules.sort import sort_op as sort
+from oneflow.nn.modules.squeeze import squeeze_op as squeeze
+from oneflow.nn.modules.stack import stack
+from oneflow.nn.modules.tan import tan_op as tan
+from oneflow.nn.modules.tensor_buffer import gen_tensor_buffer
+from oneflow.nn.modules.tensor_buffer import (
+    tensor_buffer_to_tensor_op as tensor_buffer_to_tensor,
+)
+from oneflow.nn.modules.tensor_buffer import tensor_to_tensor_buffer
+from oneflow.nn.modules.tile import tile_op as tile
+from oneflow.nn.modules.to import to_op as to
+from oneflow.nn.modules.transpose import transpose_op as transpose
+from oneflow.nn.modules.triu import triu_op as triu
+from oneflow.nn.modules.unsqueeze import unsqueeze_op as unsqueeze
+from oneflow.nn.modules.where import where_op as where
+from oneflow.ops.assign_op import assign
+from oneflow.ops.builtin_ops import BuiltinOp as builtin_op
+from oneflow.ops.categorical_ordinal_encode_op import categorical_ordinal_encode
+from oneflow.ops.constant_op import constant, constant_like, constant_scalar
+from oneflow.ops.count_not_finite import count_not_finite, multi_count_not_finite
+from oneflow.ops.eager_nccl_ops import eager_nccl_all_reduce
+from oneflow.ops.get_variable import api_get_variable as get_variable
+from oneflow.ops.initializer_util import constant_initializer, empty_initializer
+from oneflow.ops.initializer_util import glorot_normal_initializer
+from oneflow.ops.initializer_util import (
+    glorot_normal_initializer as xavier_normal_initializer,
+)
+from oneflow.ops.initializer_util import glorot_uniform_initializer
+from oneflow.ops.initializer_util import (
+    glorot_uniform_initializer as xavier_uniform_initializer,
+)
+from oneflow.ops.initializer_util import (
+    kaiming_initializer,
+    ones_initializer,
+    random_normal_initializer,
+    random_uniform_initializer,
+    truncated_normal_initializer,
+    variance_scaling_initializer,
+    zeros_initializer,
+)
+from oneflow.ops.loss_ops import ctc_loss, smooth_l1_loss
+from oneflow.ops.one_hot import one_hot
+from oneflow.ops.partial_fc_sample import distributed_partial_fc_sample
+from oneflow.ops.user_op_builder import (
+    api_consistent_user_op_builder as consistent_user_op_builder,
+)
+from oneflow.ops.user_op_builder import (
+    api_consistent_user_op_module_builder as consistent_user_op_module_builder,
+)
+from oneflow.ops.user_op_builder import api_user_op_builder as user_op_builder
+from oneflow.ops.user_op_builder import (
+    api_user_op_module_builder as user_op_module_builder,
+)
+
+from . import autograd, distributed, linalg, optim, saved_model
diff --git a/python/oneflow/__main__.py b/python/oneflow/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..05112fdbd1d58ca9277ab4b193a7c55ebfa9df33
--- /dev/null
+++ b/python/oneflow/__main__.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--start_worker", default=False, action="store_true", required=False
+)
+parser.add_argument("--env_proto", type=str, required=False)
+parser.add_argument("--doctor", default=False, action="store_true", required=False)
+args = parser.parse_args()
+
+
+def StartWorker(env_proto):
+    import oneflow._oneflow_internal
+
+    oneflow._oneflow_internal.InitEnv(env_proto, False)
+
+
+def main():
+    start_worker = args.start_worker
+    if start_worker:
+        env_proto = args.env_proto
+        assert os.path.isfile(
+            env_proto
+        ), "env_proto not found, please check your env_proto path: {}".format(env_proto)
+        with open(env_proto, "rb") as f:
+            StartWorker(f.read())
+    if args.doctor:
+        import oneflow
+        import oneflow.sysconfig
+
+        print("path:", oneflow.__path__)
+        print("version:", oneflow.__version__)
+        print("cmake_build_type:", oneflow.sysconfig.cmake_build_type())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/advanced/__init__.py b/python/oneflow/advanced/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d1055ef9e8468a0b69c6f8ac3291e67591df73
--- /dev/null
+++ b/python/oneflow/advanced/__init__.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.advanced.distribute_ops import api_distribute_add as distribute_add
+from oneflow.advanced.distribute_ops import api_distribute_clone as distribute_clone
+from oneflow.advanced.distribute_ops import api_distribute_concat as distribute_concat
+from oneflow.advanced.distribute_ops import api_distribute_map as distribute_map
+from oneflow.advanced.distribute_ops import api_distribute_split as distribute_split
diff --git a/python/oneflow/advanced/distribute_ops.py b/python/oneflow/advanced/distribute_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fe94cb8bb0b8753b2f66634f68681da221af115
--- /dev/null
+++ b/python/oneflow/advanced/distribute_ops.py
@@ -0,0 +1,226 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Callable, List, Optional, Sequence, Tuple, Union
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.hob as hob
+import oneflow.framework.id_util as id_util
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.support.enable_if as enable_if
+
+
+def api_distribute_clone(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    func = enable_if.unique([distribute_clone])
+    return func(x, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_clone(x, name=None):
+    if name is None:
+        name = id_util.UniqueStr("DistributeClone_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    setattr(op_conf.distribute_clone_conf, "in", x.unique_name)
+    parallel_size = oneflow.current_scope().device_parallel_desc_symbol.parallel_num
+    op_conf.distribute_clone_conf.out.extend(
+        ["out_%d" % i for i in range(parallel_size)]
+    )
+    interpret_util.ConsistentForward(op_conf)
+    ret = []
+    for i in range(parallel_size):
+        out = "out_%d" % i
+        lbi = logical_blob_id_util.LogicalBlobId()
+        lbi.op_name = op_conf.name
+        lbi.blob_name = out
+        ret.append(remote_blob_util.RemoteBlob(lbi))
+    return tuple(ret)
+
+
+def api_distribute_add(
+    xs: Sequence[oneflow._oneflow_internal.BlobDesc], name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([distribute_add])
+    return func(xs, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_add(xs, name=None):
+    assert oneflow.current_scope().device_parallel_desc_symbol.parallel_num == len(xs)
+    if name is None:
+        name = id_util.UniqueStr("DistributeAdd_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    getattr(op_conf.distribute_add_conf, "in").extend(
+        [_SoleConsistentLbn(x) for x in xs]
+    )
+    op_conf.distribute_add_conf.out = "out"
+    interpret_util.ConsistentForward(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def api_distribute_split(
+    x: oneflow._oneflow_internal.BlobDesc, axis: int = 0, name: Optional[str] = None
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    func = enable_if.unique([distribute_split])
+    return func(x, axis=axis, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_split(x, axis=0, name=None):
+    if name is None:
+        name = id_util.UniqueStr("DistributeSplit_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    setattr(op_conf.distribute_split_conf, "in", x.unique_name)
+    op_conf.distribute_split_conf.axis = axis
+    parallel_size = oneflow.current_scope().device_parallel_desc_symbol.parallel_num
+    op_conf.distribute_split_conf.out.extend(
+        ["out_%d" % i for i in range(parallel_size)]
+    )
+    interpret_util.ConsistentForward(op_conf)
+    ret = []
+    for i in range(parallel_size):
+        out = "out_%d" % i
+        lbi = logical_blob_id_util.LogicalBlobId()
+        lbi.op_name = op_conf.name
+        lbi.blob_name = out
+        ret.append(remote_blob_util.RemoteBlob(lbi))
+    return tuple(ret)
+
+
+def api_distribute_concat(
+    xs: Sequence[oneflow._oneflow_internal.BlobDesc],
+    axis: int = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([distribute_concat])
+    return func(xs, axis=axis, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_concat(xs, axis=0, name=None):
+    assert oneflow.current_scope().device_parallel_desc_symbol.parallel_num == len(xs)
+    if name is None:
+        name = id_util.UniqueStr("DistributeConcat_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    getattr(op_conf.distribute_concat_conf, "in").extend(
+        [_SoleConsistentLbn(x) for x in xs]
+    )
+    op_conf.distribute_concat_conf.axis = axis
+    op_conf.distribute_concat_conf.out = "out"
+    interpret_util.ConsistentForward(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def api_distribute_map(
+    xs: Union[
+        Sequence[oneflow._oneflow_internal.BlobDesc], oneflow._oneflow_internal.BlobDesc
+    ],
+    f: Callable[
+        [oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc],
+        oneflow._oneflow_internal.BlobDesc,
+    ],
+    axis: int = 0,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    func = enable_if.unqiue([distribute_map])
+    return func(xs, f, axis=axis)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_map(xs, f, axis=0):
+    _AssertInputOrOutput(xs)
+    if isinstance(xs, (list, tuple)) == False:
+        xs = [xs]
+    splitted_xs = [oneflow.advanced.distribute_split(x, axis=axis) for x in xs]
+    results = [_UnderSingleDevicePlacementScope(f, *x) for x in zip(*splitted_xs)]
+    output_is_not_container = all(
+        [isinstance(x, oneflow._oneflow_internal.ConsistentBlob) for x in results]
+    )
+    results = [_TryWrapTuple(x) for x in results]
+    result = [oneflow.advanced.distribute_concat(x, axis=axis) for x in zip(*results)]
+    if output_is_not_container:
+        return result[0]
+    return tuple(result)
+
+
+def cast_to_current_logical_view(
+    x: oneflow._oneflow_internal.BlobDesc,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if (
+        isinstance(x, oneflow._oneflow_internal.ConsistentBlob)
+        and oneflow.scope.mirrored_view_enabled()
+        or (
+            isinstance(x, oneflow._oneflow_internal.MirroredBlob)
+            and oneflow.scope.consistent_view_enabled()
+        )
+    ):
+        x = oneflow.identity(x)
+    return x
+
+
+def _SoleConsistentLbn(blob):
+    assert blob.parallel_size == 1
+    if isinstance(blob, oneflow._oneflow_internal.ConsistentBlob):
+        return blob.unique_name
+    if isinstance(blob, oneflow._oneflow_internal.MirroredBlob):
+        return blob.sub_consistent_blob_list[0].unique_name
+    raise NotImplementedError
+
+
+def _AssertInputOrOutput(xs):
+    assert isinstance(xs, (list, tuple, oneflow._oneflow_internal.ConsistentBlob))
+    if isinstance(xs, (list, tuple)):
+        assert len(xs) > 0
+        assert all(
+            [isinstance(x, oneflow._oneflow_internal.ConsistentBlob) for x in xs]
+        )
+
+
+def _TryWrapTuple(ys):
+    _AssertInputOrOutput(ys)
+    if isinstance(ys, (list, tuple)) == False:
+        ys = (ys,)
+    return ys
+
+
+def _UnderSingleDevicePlacementScope(f, *args):
+    parallel_desc_symbol = oneflow.current_scope().device_parallel_desc_symbol
+    for (machine_id, device_id) in _EachMachineIdAndDeviceId(parallel_desc_symbol):
+        mch_dev_str = "@%d:%d" % (machine_id, device_id)
+        with oneflow.scope.placement(parallel_desc_symbol.device_tag, mch_dev_str):
+            return f(*args)
+
+
+def _EachMachineIdAndDeviceId(parallel_desc_symbol):
+    for (
+        machine_id,
+        device_id_list,
+    ) in parallel_desc_symbol.machine_id2device_id_list.items():
+        for device_id in device_id_list:
+            yield (machine_id, device_id)
diff --git a/python/oneflow/autograd/__init__.py b/python/oneflow/autograd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8452d5241fac16308a8a62ad7426ac6f5b22da1a
--- /dev/null
+++ b/python/oneflow/autograd/__init__.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.autograd.autograd import backward, grad
diff --git a/python/oneflow/autograd/autograd.py b/python/oneflow/autograd/autograd.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d05f45e99cec7b773470a5e3e533dabe250c1b
--- /dev/null
+++ b/python/oneflow/autograd/autograd.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence, Tuple, Union
+
+from oneflow._oneflow_internal import TensorTuple
+from oneflow._oneflow_internal.autograd import backward as backward_api
+from oneflow._oneflow_internal.autograd import grad as grad_api
+from oneflow.framework.tensor import Tensor
+from oneflow.framework.tensor_tuple_util import convert_to_tensor_tuple
+
+
+def grad(
+    outputs: Union[Tensor, Sequence[Tensor]],
+    inputs: Union[Tensor, Sequence[Tensor]],
+    out_grads: Union[Tensor, Sequence[Tensor], None] = None,
+    retain_graph: bool = False,
+    create_graph: bool = False,
+) -> Tuple[Tensor]:
+    in_grads = grad_api(
+        convert_to_tensor_tuple(outputs),
+        convert_to_tensor_tuple(inputs),
+        convert_to_tensor_tuple(out_grads),
+        retain_graph,
+        create_graph,
+    )
+    return tuple([Tensor(x) for x in in_grads])
+
+
+def backward(
+    outputs: Union[Tensor, Sequence[Tensor]],
+    out_grads: Union[Tensor, Sequence[Tensor], None],
+    retain_graph: bool = False,
+    create_graph: bool = False,
+) -> None:
+    backward_api(
+        convert_to_tensor_tuple(outputs),
+        convert_to_tensor_tuple(out_grads),
+        retain_graph,
+        create_graph,
+    )
diff --git a/python/oneflow/benchmarks/bert_benchmark/benchmark_util.py b/python/oneflow/benchmarks/bert_benchmark/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..95bf2d602b6fffe9d9c4a1e8f9c535e2c2102227
--- /dev/null
+++ b/python/oneflow/benchmarks/bert_benchmark/benchmark_util.py
@@ -0,0 +1,110 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import time
+
+import numpy as np
+
+
+class StopWatch:
+    def __init__(self):
+        pass
+
+    def start(self):
+        self.start_time = time.time()
+        self.last_split = self.start_time
+
+    def set_start(self, val):
+        self.start_time = val
+        self.last_split = self.start_time
+
+    def split(self):
+        now = time.time()
+        duration = now - self.last_split
+        self.last_split = now
+        return duration
+
+    def stop(self):
+        self.stop_time = time.time()
+
+    def duration(self):
+        return self.stop_time - self.start_time
+
+
+class BERTSpeedometer:
+    def __init__(self):
+        self.watch = StopWatch()
+        self.throughoutput_list = []
+
+    def speedometer_cb(
+        self,
+        step,
+        start_time,
+        total_batch_size,
+        skip_iter_num,
+        iter_num,
+        loss_print_every_n_iter,
+    ):
+        def callback(train_loss):
+            assert skip_iter_num >= 0
+            if skip_iter_num == 0 and step == 0:
+                self.watch.set_start(start_time)
+                print("Start trainning without any skipping iteration.")
+            if step < skip_iter_num:
+                if step == 0:
+                    print(
+                        "Skipping {} iterations for benchmark purpose.".format(
+                            skip_iter_num
+                        )
+                    )
+                if step + 1 == skip_iter_num:
+                    self.watch.start()
+                    print("Start trainning.")
+            else:
+                train_step = step - skip_iter_num
+                if (train_step + 1) % loss_print_every_n_iter == 0:
+                    total_loss = train_loss[0].mean()
+                    mlm_loss = train_loss[1].mean()
+                    nsp_loss = train_loss[2].mean()
+                    avg_elapse_time_per_iter = (
+                        self.watch.split() / loss_print_every_n_iter
+                    )
+                    sentences_per_sec = total_batch_size / avg_elapse_time_per_iter
+                    print(
+                        "iter {}, total_loss: {:.3f}, mlm_loss: {:.3f}, nsp_loss: {:.3f}, speed: {:.3f}(sec/batch), {:.3f}(sentences/sec)".format(
+                            train_step,
+                            total_loss,
+                            mlm_loss,
+                            nsp_loss,
+                            avg_elapse_time_per_iter,
+                            sentences_per_sec,
+                        )
+                    )
+                    self.throughoutput_list.append(sentences_per_sec)
+                if train_step + 1 == iter_num:
+                    self.watch.stop()
+                    totoal_duration = self.watch.duration()
+                    avg_sentences_per_sec = (
+                        total_batch_size * iter_num / totoal_duration
+                    )
+                    print("-".ljust(66, "-"))
+                    print(
+                        "average speed: {:.3f}(sentences/sec), new_cal_method: {:.3f}(sentences/sec)".format(
+                            avg_sentences_per_sec, np.mean(self.throughoutput_list)
+                        )
+                    )
+                    print("-".ljust(66, "-"))
+
+        return callback
diff --git a/python/oneflow/benchmarks/bert_benchmark/bert.py b/python/oneflow/benchmarks/bert_benchmark/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..42691b4e73123b559d0d0c1af2f9f3c0cd5c30fe
--- /dev/null
+++ b/python/oneflow/benchmarks/bert_benchmark/bert.py
@@ -0,0 +1,397 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+import oneflow as flow
+import oneflow.core.common.data_type_pb2 as data_type_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+class BertBackbone(object):
+    def __init__(
+        self,
+        input_ids_blob,
+        input_mask_blob,
+        token_type_ids_blob,
+        vocab_size,
+        seq_length=512,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        initializer_range=0.02,
+    ):
+        with flow.scope.namespace("bert"):
+            with flow.scope.namespace("embeddings"):
+                (self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
+                    input_ids_blob=input_ids_blob,
+                    vocab_size=vocab_size,
+                    embedding_size=hidden_size,
+                    initializer_range=initializer_range,
+                    word_embedding_name="word_embeddings",
+                )
+                self.embedding_output_ = _EmbeddingPostprocessor(
+                    input_blob=self.embedding_output_,
+                    seq_length=seq_length,
+                    embedding_size=hidden_size,
+                    use_token_type=True,
+                    token_type_ids_blob=token_type_ids_blob,
+                    token_type_vocab_size=type_vocab_size,
+                    token_type_embedding_name="token_type_embeddings",
+                    use_position_embeddings=True,
+                    position_embedding_name="position_embeddings",
+                    initializer_range=initializer_range,
+                    max_position_embeddings=max_position_embeddings,
+                    dropout_prob=hidden_dropout_prob,
+                )
+            with flow.scope.namespace("encoder"):
+                attention_mask_blob = _CreateAttentionMaskFromInputMask(
+                    input_mask_blob,
+                    from_seq_length=seq_length,
+                    to_seq_length=seq_length,
+                )
+                self.all_encoder_layers_ = _TransformerModel(
+                    input_blob=self.embedding_output_,
+                    attention_mask_blob=attention_mask_blob,
+                    seq_length=seq_length,
+                    hidden_size=hidden_size,
+                    num_hidden_layers=num_hidden_layers,
+                    num_attention_heads=num_attention_heads,
+                    intermediate_size=intermediate_size,
+                    intermediate_act_fn=GetActivation(hidden_act),
+                    hidden_dropout_prob=hidden_dropout_prob,
+                    attention_probs_dropout_prob=attention_probs_dropout_prob,
+                    initializer_range=initializer_range,
+                    do_return_all_layers=False,
+                )
+            self.sequence_output_ = self.all_encoder_layers_[-1]
+
+    def embedding_output(self):
+        return self.embedding_output_
+
+    def all_encoder_layers(self):
+        return self.all_encoder_layers_
+
+    def sequence_output(self):
+        return self.sequence_output_
+
+    def embedding_table(self):
+        return self.embedding_table_
+
+
+def CreateInitializer(std):
+    return flow.truncated_normal(std)
+
+
+def _Gelu(in_blob):
+    return flow.math.gelu(in_blob)
+
+
+def _TransformerModel(
+    input_blob,
+    attention_mask_blob,
+    seq_length,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    intermediate_act_fn=_Gelu,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    initializer_range=0.02,
+    do_return_all_layers=False,
+):
+    assert hidden_size % num_attention_heads == 0
+    attention_head_size = int(hidden_size / num_attention_heads)
+    input_width = hidden_size
+    prev_output_blob = flow.reshape(input_blob, (-1, input_width))
+    all_layer_output_blobs = []
+    for layer_idx in range(num_hidden_layers):
+        with flow.scope.namespace("layer_%d" % layer_idx):
+            layer_input_blob = prev_output_blob
+            with flow.scope.namespace("attention"):
+                with flow.scope.namespace("self"):
+                    attention_output_blob = _AttentionLayer(
+                        from_blob=layer_input_blob,
+                        to_blob=layer_input_blob,
+                        attention_mask_blob=attention_mask_blob,
+                        num_attention_heads=num_attention_heads,
+                        size_per_head=attention_head_size,
+                        attention_probs_dropout_prob=attention_probs_dropout_prob,
+                        initializer_range=initializer_range,
+                        do_return_2d_tensor=True,
+                        from_seq_length=seq_length,
+                        to_seq_length=seq_length,
+                    )
+                with flow.scope.namespace("output"):
+                    attention_output_blob = _FullyConnected(
+                        attention_output_blob,
+                        input_size=num_attention_heads * attention_head_size,
+                        units=hidden_size,
+                        weight_initializer=CreateInitializer(initializer_range),
+                        name="dense",
+                    )
+                    attention_output_blob = _Dropout(
+                        attention_output_blob, hidden_dropout_prob
+                    )
+                    attention_output_blob = attention_output_blob + layer_input_blob
+                    attention_output_blob = _LayerNorm(
+                        attention_output_blob, hidden_size
+                    )
+            with flow.scope.namespace("intermediate"):
+                if callable(intermediate_act_fn):
+                    act_fn = op_conf_util.kNone
+                else:
+                    act_fn = intermediate_act_fn
+                intermediate_output_blob = _FullyConnected(
+                    attention_output_blob,
+                    input_size=num_attention_heads * attention_head_size,
+                    units=intermediate_size,
+                    activation=act_fn,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                if callable(intermediate_act_fn):
+                    intermediate_output_blob = intermediate_act_fn(
+                        intermediate_output_blob
+                    )
+            with flow.scope.namespace("output"):
+                layer_output_blob = _FullyConnected(
+                    intermediate_output_blob,
+                    input_size=intermediate_size,
+                    units=hidden_size,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
+                layer_output_blob = layer_output_blob + attention_output_blob
+                layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
+                prev_output_blob = layer_output_blob
+                all_layer_output_blobs.append(layer_output_blob)
+    input_shape = (-1, seq_length, hidden_size)
+    if do_return_all_layers:
+        final_output_blobs = []
+        for layer_output_blob in all_layer_output_blobs:
+            final_output_blob = flow.reshape(layer_output_blob, input_shape)
+            final_output_blobs.append(final_output_blob)
+        return final_output_blobs
+    else:
+        final_output_blob = flow.reshape(prev_output_blob, input_shape)
+        return [final_output_blob]
+
+
+def _AttentionLayer(
+    from_blob,
+    to_blob,
+    attention_mask_blob,
+    num_attention_heads=1,
+    size_per_head=512,
+    query_act=op_conf_util.kNone,
+    key_act=op_conf_util.kNone,
+    value_act=op_conf_util.kNone,
+    attention_probs_dropout_prob=0.0,
+    initializer_range=0.02,
+    do_return_2d_tensor=False,
+    batch_size=None,
+    from_seq_length=None,
+    to_seq_length=None,
+):
+    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
+        output_blob = flow.reshape(
+            input_blob, [-1, seq_length, num_attention_heads, width]
+        )
+        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
+        return output_blob
+
+    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
+    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])
+    query_blob = _FullyConnected(
+        from_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=query_act,
+        name="query",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    key_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=key_act,
+        name="key",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    value_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=value_act,
+        name="value",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    query_blob = TransposeForScores(
+        query_blob, num_attention_heads, from_seq_length, size_per_head
+    )
+    key_blob = TransposeForScores(
+        key_blob, num_attention_heads, to_seq_length, size_per_head
+    )
+    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
+    attention_scores_blob = attention_scores_blob * (
+        1.0 / math.sqrt(float(size_per_head))
+    )
+    attention_mask_blob = flow.reshape(
+        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length]
+    )
+    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
+    addr_blob = (attention_mask_blob - 1.0) * 10000.0
+    attention_scores_blob = attention_scores_blob + addr_blob
+    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
+    attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)
+    value_blob = flow.reshape(
+        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head]
+    )
+    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
+    context_blob = flow.matmul(attention_probs_blob, value_blob)
+    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])
+    if do_return_2d_tensor:
+        context_blob = flow.reshape(
+            context_blob, [-1, num_attention_heads * size_per_head]
+        )
+    else:
+        context_blob = flow.reshape(
+            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head]
+        )
+    return context_blob
+
+
+def _FullyConnected(
+    input_blob, input_size, units, activation=None, name=None, weight_initializer=None
+):
+    weight_blob = flow.get_variable(
+        name=name + "-weight",
+        shape=[input_size, units],
+        dtype=input_blob.dtype,
+        initializer=weight_initializer,
+    )
+    bias_blob = flow.get_variable(
+        name=name + "-bias",
+        shape=[units],
+        dtype=input_blob.dtype,
+        initializer=flow.constant_initializer(0.0),
+    )
+    output_blob = flow.matmul(input_blob, weight_blob)
+    output_blob = flow.nn.bias_add(output_blob, bias_blob)
+    return output_blob
+
+
+def _Dropout(input_blob, dropout_prob):
+    if dropout_prob == 0.0:
+        return input_blob
+    return flow.nn.dropout(input_blob, rate=dropout_prob)
+
+
+def _LayerNorm(input_blob, hidden_size):
+    return flow.layers.layer_norm(
+        input_blob, name="LayerNorm", begin_norm_axis=-1, begin_params_axis=-1
+    )
+
+
+def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
+    output = flow.cast(to_mask_blob, dtype=flow.float)
+    output = flow.reshape(output, [-1, 1, to_seq_length])
+    zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
+    output = zeros + output
+    return output
+
+
+def _EmbeddingPostprocessor(
+    input_blob,
+    seq_length,
+    embedding_size,
+    use_token_type=False,
+    token_type_ids_blob=None,
+    token_type_vocab_size=16,
+    token_type_embedding_name="token_type_embeddings",
+    use_position_embeddings=True,
+    position_embedding_name="position_embeddings",
+    initializer_range=0.02,
+    max_position_embeddings=512,
+    dropout_prob=0.1,
+):
+    output = input_blob
+    if use_token_type:
+        assert token_type_ids_blob is not None
+        token_type_table = flow.get_variable(
+            name=token_type_embedding_name,
+            shape=[token_type_vocab_size, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        token_type_embeddings = flow.gather(
+            params=token_type_table, indices=token_type_ids_blob, axis=0
+        )
+        output = output + token_type_embeddings
+    if use_position_embeddings:
+        position_table = flow.get_variable(
+            name=position_embedding_name,
+            shape=[1, max_position_embeddings, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        assert seq_length <= max_position_embeddings
+        if seq_length != max_position_embeddings:
+            position_table = flow.slice(
+                position_table, begin=[None, 0, 0], size=[None, seq_length, -1]
+            )
+        output = output + position_table
+    output = _LayerNorm(output, embedding_size)
+    output = _Dropout(output, dropout_prob)
+    return output
+
+
+def _EmbeddingLookup(
+    input_ids_blob,
+    vocab_size,
+    embedding_size=128,
+    initializer_range=0.02,
+    word_embedding_name="word_embeddings",
+):
+    embedding_table = flow.get_variable(
+        name=word_embedding_name,
+        shape=[vocab_size, embedding_size],
+        dtype=flow.float,
+        initializer=CreateInitializer(initializer_range),
+    )
+    output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
+    return (output, embedding_table)
+
+
+def GetActivation(name):
+    if name == "linear":
+        return None
+    elif name == "relu":
+        return flow.math.relu
+    elif name == "tanh":
+        return flow.math.tanh
+    elif name == "gelu":
+        return flow.math.gelu
+    else:
+        raise Exception("unsupported activation")
diff --git a/python/oneflow/benchmarks/bert_benchmark/pretrain.py b/python/oneflow/benchmarks/bert_benchmark/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..03832bf827ff9c1003a6ddb2a23d37d4c1bb450d
--- /dev/null
+++ b/python/oneflow/benchmarks/bert_benchmark/pretrain.py
@@ -0,0 +1,189 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import bert as bert_util
+
+import oneflow as flow
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+def PreTrain(
+    input_ids_blob,
+    input_mask_blob,
+    token_type_ids_blob,
+    masked_lm_positions_blob,
+    masked_lm_ids_blob,
+    masked_lm_weights_blob,
+    next_sentence_label_blob,
+    vocab_size,
+    seq_length=512,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    max_position_embeddings=512,
+    type_vocab_size=16,
+    max_predictions_per_seq=20,
+    initializer_range=0.02,
+):
+    backbone = bert_util.BertBackbone(
+        input_ids_blob=input_ids_blob,
+        input_mask_blob=input_mask_blob,
+        token_type_ids_blob=token_type_ids_blob,
+        vocab_size=vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act=hidden_act,
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        initializer_range=initializer_range,
+    )
+    (lm_loss, _, _) = _AddMaskedLanguageModelLoss(
+        input_blob=backbone.sequence_output(),
+        output_weights_blob=backbone.embedding_table(),
+        positions_blob=masked_lm_positions_blob,
+        label_id_blob=masked_lm_ids_blob,
+        label_weight_blob=masked_lm_weights_blob,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        vocab_size=vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        hidden_act=bert_util.GetActivation(hidden_act),
+        initializer_range=initializer_range,
+    )
+    pooled_output = PooledOutput(
+        backbone.sequence_output(), hidden_size, initializer_range
+    )
+    (ns_loss, _, _) = _AddNextSentenceOutput(
+        input_blob=pooled_output,
+        label_blob=next_sentence_label_blob,
+        hidden_size=hidden_size,
+        initializer_range=initializer_range,
+    )
+    with flow.scope.namespace("cls-loss"):
+        total_loss = lm_loss + ns_loss
+    return (total_loss, lm_loss, ns_loss)
+
+
+def PooledOutput(sequence_output, hidden_size, initializer_range):
+    with flow.scope.namespace("bert-pooler"):
+        first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
+        first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
+        pooled_output = bert_util._FullyConnected(
+            first_token_tensor,
+            input_size=hidden_size,
+            units=hidden_size,
+            weight_initializer=bert_util.CreateInitializer(initializer_range),
+            name="dense",
+        )
+        pooled_output = flow.math.tanh(pooled_output)
+    return pooled_output
+
+
+def _AddMaskedLanguageModelLoss(
+    input_blob,
+    output_weights_blob,
+    positions_blob,
+    label_id_blob,
+    label_weight_blob,
+    seq_length,
+    hidden_size,
+    vocab_size,
+    max_predictions_per_seq,
+    hidden_act,
+    initializer_range,
+):
+    with flow.scope.namespace("other"):
+        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
+        ones = sum_label_weight_blob * 0.0 + 1.0
+        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
+        batch_size = flow.math.reduce_sum(ones)
+        sum_label_weight_blob = sum_label_weight_blob / batch_size
+    with flow.scope.namespace("cls-predictions"):
+        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
+        with flow.scope.namespace("transform"):
+            if callable(hidden_act):
+                act_fn = op_conf_util.kNone
+            else:
+                act_fn = hidden_act
+            input_blob = bert_util._FullyConnected(
+                input_blob,
+                input_size=hidden_size,
+                units=hidden_size,
+                activation=act_fn,
+                weight_initializer=bert_util.CreateInitializer(initializer_range),
+                name="dense",
+            )
+            if callable(hidden_act):
+                input_blob = hidden_act(input_blob)
+                input_blob = bert_util._LayerNorm(input_blob, hidden_size)
+        output_bias = flow.get_variable(
+            name="output_bias",
+            shape=[vocab_size],
+            dtype=input_blob.dtype,
+            initializer=flow.constant_initializer(1.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
+        label_id_blob = flow.reshape(label_id_blob, [-1])
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_id_blob
+        )
+        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
+        numerator = pre_example_loss * label_weight_blob
+        with flow.scope.namespace("loss"):
+            numerator = flow.math.reduce_sum(numerator, axis=[-1])
+            denominator = sum_label_weight_blob + 1e-05
+            loss = numerator / denominator
+        return (loss, pre_example_loss, logit_blob)
+
+
+def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
+    output = flow.gather(
+        params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
+    )
+    output = flow.reshape(output, [-1, hidden_size])
+    return output
+
+
+def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
+    with flow.scope.namespace("cls-seq_relationship"):
+        output_weight_blob = flow.get_variable(
+            name="output_weights",
+            shape=[2, hidden_size],
+            dtype=input_blob.dtype,
+            initializer=bert_util.CreateInitializer(initializer_range),
+        )
+        output_bias_blob = flow.get_variable(
+            name="output_bias",
+            shape=[2],
+            dtype=input_blob.dtype,
+            initializer=flow.constant_initializer(0.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_blob
+        )
+        loss = pre_example_loss
+        return (loss, pre_example_loss, logit_blob)
diff --git a/python/oneflow/benchmarks/bert_benchmark/run_pretraining.py b/python/oneflow/benchmarks/bert_benchmark/run_pretraining.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ac6ba748976735554ec0b4ce1a227cc16f7a71
--- /dev/null
+++ b/python/oneflow/benchmarks/bert_benchmark/run_pretraining.py
@@ -0,0 +1,311 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+import random
+import time
+from collections import OrderedDict
+from datetime import datetime
+
+import benchmark_util
+from pretrain import PreTrain
+
+import oneflow as flow
+
+parser = argparse.ArgumentParser(description="flags for bert")
+parser.add_argument("--gpu_num_per_node", type=int, default=1)
+parser.add_argument("--node_num", type=int, default=1)
+parser.add_argument("--node_list", type=str, default=None)
+parser.add_argument("--learning_rate", type=float, default=0.0001, help="Learning rate")
+parser.add_argument(
+    "--weight_decay_rate", type=float, default=0.01, help="weight decay rate"
+)
+parser.add_argument("--batch_size_per_device", type=int, default=24)
+parser.add_argument("--iter_num", type=int, default=10, help="total iterations to run")
+parser.add_argument(
+    "--skip_iter_num",
+    type=int,
+    default=10,
+    help="number of skipping iterations for benchmark purpose.",
+)
+parser.add_argument(
+    "--log_every_n_iter", type=int, default=1, help="print loss every n iteration"
+)
+parser.add_argument("--data_dir", type=str, default=None)
+parser.add_argument(
+    "--data_part_num", type=int, default=32, help="data part number in dataset"
+)
+parser.add_argument(
+    "--enable_auto_mixed_precision",
+    default=False,
+    type=lambda x: str(x).lower() == "true",
+)
+parser.add_argument(
+    "--loss_print_every_n_iter",
+    type=int,
+    default=1,
+    required=False,
+    help="print loss every n iteration",
+)
+parser.add_argument(
+    "--model_save_every_n_iter",
+    type=int,
+    default=200,
+    required=False,
+    help="save model every n iteration",
+)
+parser.add_argument(
+    "--model_save_dir",
+    type=str,
+    default="./output/model_save-{}".format(
+        str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+    ),
+    required=False,
+    help="model save directory",
+)
+parser.add_argument(
+    "--save_last_snapshot",
+    type=bool,
+    default=False,
+    required=False,
+    help="save model snapshot for last iteration",
+)
+parser.add_argument(
+    "--model_load_dir",
+    type=str,
+    default=None,
+    required=False,
+    help="model load directory",
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default="./output",
+    required=False,
+    help="log info save directory",
+)
+parser.add_argument("--seq_length", type=int, default=512)
+parser.add_argument("--max_predictions_per_seq", type=int, default=80)
+parser.add_argument("--num_hidden_layers", type=int, default=24)
+parser.add_argument("--num_attention_heads", type=int, default=16)
+parser.add_argument("--max_position_embeddings", type=int, default=512)
+parser.add_argument("--type_vocab_size", type=int, default=2)
+parser.add_argument("--vocab_size", type=int, default=30522)
+parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
+parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
+parser.add_argument("--hidden_size_per_head", type=int, default=64)
+parser.add_argument("--warmup_batches", type=int, default=1000)
+parser.add_argument("--lr_decay_num", type=int, default=100000)
+parser.add_argument(
+    "--lr_decay_num_same_as_iter_num",
+    default=False,
+    type=lambda x: str(x).lower() == "true",
+)
+args = parser.parse_args()
+
+
+def _blob_conf(name, shape, dtype=flow.int32):
+    return flow.data.BlobConf(
+        name=name, shape=shape, dtype=dtype, codec=flow.data.RawCodec()
+    )
+
+
+def BertDecoder(
+    data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq
+):
+    config_ordered_dict = OrderedDict()
+    config_ordered_dict["input_ids"] = seq_length
+    config_ordered_dict["next_sentence_labels"] = 1
+    config_ordered_dict["input_mask"] = seq_length
+    config_ordered_dict["segment_ids"] = seq_length
+    config_ordered_dict["masked_lm_ids"] = max_predictions_per_seq
+    config_ordered_dict["masked_lm_positions"] = max_predictions_per_seq
+    config_ordered_dict["masked_lm_weights"] = max_predictions_per_seq
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=batch_size, data_part_num=data_part_num, name="decode"
+    )
+    ret = {}
+    for (k, v) in config_ordered_dict.items():
+        ret[k] = flow.data.ofrecord_raw_decoder(
+            ofrecord,
+            k,
+            shape=(v,),
+            dtype=flow.float if k == "masked_lm_weights" else flow.int32,
+        )
+    return ret
+
+
+def BuildPreTrainNet(
+    batch_size,
+    data_part_num,
+    seq_length=128,
+    max_position_embeddings=512,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    vocab_size=30522,
+    type_vocab_size=2,
+    max_predictions_per_seq=20,
+):
+    hidden_size = 64 * num_attention_heads
+    intermediate_size = hidden_size * 4
+    decoders = BertDecoder(
+        args.data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq
+    )
+    input_ids = decoders["input_ids"]
+    next_sentence_labels = decoders["next_sentence_labels"]
+    input_mask = decoders["input_mask"]
+    token_type_ids = decoders["segment_ids"]
+    masked_lm_ids = decoders["masked_lm_ids"]
+    masked_lm_positions = decoders["masked_lm_positions"]
+    masked_lm_weights = decoders["masked_lm_weights"]
+    return PreTrain(
+        input_ids,
+        input_mask,
+        token_type_ids,
+        masked_lm_positions,
+        masked_lm_ids,
+        masked_lm_weights,
+        next_sentence_labels,
+        vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act="gelu",
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        initializer_range=0.02,
+    )
+
+
+_BERT_MODEL_UPDATE_CONF = dict(
+    learning_rate_decay=dict(
+        polynomial_conf=dict(
+            decay_batches=args.iter_num
+            if args.lr_decay_num_same_as_iter_num
+            else args.lr_decay_num,
+            end_learning_rate=0.0,
+        )
+    ),
+    warmup_conf=dict(
+        linear_conf=dict(warmup_batches=args.warmup_batches, start_multiplier=0)
+    ),
+    clip_conf=dict(clip_by_global_norm=dict(clip_norm=1.0)),
+    adam_conf=dict(epsilon=1e-06),
+    weight_decay_conf=dict(
+        weight_decay_rate=args.weight_decay_rate,
+        excludes=dict(pattern=["bias", "LayerNorm", "layer_norm"]),
+    ),
+)
+func_config = flow.FunctionConfig()
+func_config.default_distribute_strategy(flow.scope.consistent_view())
+func_config.train.primary_lr(args.learning_rate)
+func_config.default_data_type(flow.float)
+func_config.train.model_update_conf(_BERT_MODEL_UPDATE_CONF)
+func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+flow.config.gpu_device_num(args.gpu_num_per_node)
+
+
+@flow.global_function(func_config)
+def PretrainJob():
+    total_device_num = args.node_num * args.gpu_num_per_node
+    batch_size = total_device_num * args.batch_size_per_device
+    (total_loss, mlm_loss, nsp_loss) = BuildPreTrainNet(
+        batch_size,
+        args.data_part_num,
+        seq_length=args.seq_length,
+        max_position_embeddings=args.max_position_embeddings,
+        num_hidden_layers=args.num_hidden_layers,
+        num_attention_heads=args.num_attention_heads,
+        hidden_dropout_prob=args.hidden_dropout_prob,
+        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
+        vocab_size=args.vocab_size,
+        type_vocab_size=args.type_vocab_size,
+        max_predictions_per_seq=args.max_predictions_per_seq,
+    )
+    flow.losses.add_loss(total_loss)
+    return (total_loss, mlm_loss, nsp_loss)
+
+
+def main():
+    print("=".ljust(66, "="))
+    print(
+        "Running bert: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, args.node_num
+        )
+    )
+    print("=".ljust(66, "="))
+    for arg in vars(args):
+        print("{} = {}".format(arg, getattr(args, arg)))
+    print("-".ljust(66, "-"))
+    print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
+    flow.env.log_dir(args.log_dir)
+    if args.node_num > 1:
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+    if os.getenv("ONEFLOW_DRY_RUN"):
+        flow.env.ctrl_port(9788)
+    check_point = flow.train.CheckPoint()
+    if args.model_load_dir:
+        assert os.path.isdir(args.model_load_dir)
+        check_point.load(args.model_load_dir)
+        print("Restoring model from {}.".format(args.model_load_dir))
+    else:
+        check_point.init()
+        print("Init model on demand")
+    total_batch_size = (
+        args.node_num * args.gpu_num_per_node * args.batch_size_per_device
+    )
+    speedometer = benchmark_util.BERTSpeedometer()
+    start_time = time.time()
+    for step in range(args.skip_iter_num + args.iter_num):
+        cb = speedometer.speedometer_cb(
+            step,
+            start_time,
+            total_batch_size,
+            args.skip_iter_num,
+            args.iter_num,
+            args.loss_print_every_n_iter,
+        )
+        PretrainJob().async_get(cb)
+        if (step + 1) % args.model_save_every_n_iter == 0:
+            if not os.path.exists(args.model_save_dir):
+                os.makedirs(args.model_save_dir)
+            snapshot_save_path = os.path.join(
+                args.model_save_dir, "snapshot_%d" % (step + 1)
+            )
+            print("Saving model to {}.".format(snapshot_save_path))
+            check_point.save(snapshot_save_path)
+    if args.save_last_snapshot:
+        snapshot_save_path = os.path.join(args.model_save_dir, "last_snapshot")
+        if not os.path.exists(snapshot_save_path):
+            os.makedirs(snapshot_save_path)
+        print("Saving model to {}.".format(snapshot_save_path))
+        check_point.save(snapshot_save_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/benchmarks/cnn_benchmark/__init__.py b/python/oneflow/benchmarks/cnn_benchmark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/benchmarks/cnn_benchmark/alexnet.py b/python/oneflow/benchmarks/cnn_benchmark/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..77c60dbb0bf42c4a12d68e7eec0ad2fdbf67815a
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/alexnet.py
@@ -0,0 +1,195 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+
+import oneflow as flow
+
+DATA_DIR = "/dataset/imagenet_1k/oneflow/30/train"
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=None, required=False)
+parser.add_argument("-e", "--eval_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument("-load", "--model_load_dir", type=str, default="", required=False)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default="./checkpoints", required=False
+)
+args = parser.parse_args()
+
+
+def _data_load_layer(data_dir):
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=12, data_part_num=8, name="decode"
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=227, resize_y=227, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation="Relu",
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=None,
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == "Relu":
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def alexnet(images, labels):
+    conv1 = _conv2d_layer(
+        "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv2d_layer("conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv2d_layer("conv3", pool2, filters=384)
+    conv4 = _conv2d_layer("conv4", conv3, filters=384)
+    conv5 = _conv2d_layer("conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=True,
+        name="fc1",
+    )
+    dropout1 = flow.nn.dropout(fc1, rate=0.5)
+    fc2 = flow.layers.dense(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=True,
+        name="fc2",
+    )
+    dropout2 = flow.nn.dropout(fc2, rate=0.5)
+    fc3 = flow.layers.dense(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=True,
+        name="fc3",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc3, name="softmax_loss"
+    )
+    return loss
+
+
+@flow.global_function
+def alexnet_train_job():
+    flow.config.train.primary_lr(1e-05)
+    flow.config.train.model_update_conf(dict(naive_conf={}))
+    (labels, images) = _data_load_layer(args.train_dir)
+    loss = alexnet(images, labels)
+    flow.losses.add_loss(loss)
+    return loss
+
+
+@flow.global_function
+def alexnet_eval_job():
+    (labels, images) = _data_load_layer(args.eval_dir)
+    loss = alexnet(images, labels)
+    return loss
+
+
+def main():
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    flow.config.ctrl_port(9788)
+    flow.config.default_data_type(flow.float)
+    if args.multinode:
+        flow.config.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.config.machine(nodes)
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    for i in range(args.iter_num):
+        fmt_str = "{:>12}  {:>12}  {:>12.10f}"
+        train_loss = alexnet_train_job().get().mean()
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 10 == 0:
+            eval_loss = alexnet_eval_job().get().mean()
+            print(fmt_str.format(i, "eval loss:", eval_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(args.model_save_dir + str(i))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/benchmarks/cnn_benchmark/alexnet_model.py b/python/oneflow/benchmarks/cnn_benchmark/alexnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e3c7c09faf454ee613b1f98e89cb2e04fcce60c
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/alexnet_model.py
@@ -0,0 +1,66 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from model_util import conv2d_layer
+
+import oneflow as flow
+
+
+def alexnet(images, trainable=True):
+    conv1 = conv2d_layer(
+        "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = conv2d_layer("conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = conv2d_layer("conv3", pool2, filters=384)
+    conv4 = conv2d_layer("conv4", conv3, filters=384)
+    conv5 = conv2d_layer("conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc1",
+    )
+    dropout1 = flow.nn.dropout(fc1, rate=0.5)
+    fc2 = flow.layers.dense(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc2",
+    )
+    dropout2 = flow.nn.dropout(fc2, rate=0.5)
+    fc3 = flow.layers.dense(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc3",
+    )
+    return fc3
diff --git a/python/oneflow/benchmarks/cnn_benchmark/benchmark_util.py b/python/oneflow/benchmarks/cnn_benchmark/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..38b867b8825b691d5d3ba1e4d8920f5ae046137e
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/benchmark_util.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import time
+
+import numpy as np
+
+
+class StopWatch:
+    def __init__(self):
+        pass
+
+    def start(self):
+        self.start_time = time.time()
+        self.last_split = self.start_time
+
+    def set_start(self, val):
+        self.start_time = val
+        self.last_split = self.start_time
+
+    def split(self):
+        now = time.time()
+        duration = now - self.last_split
+        self.last_split = now
+        return duration
+
+    def stop(self):
+        self.stop_time = time.time()
+
+    def duration(self):
+        return self.stop_time - self.start_time
+
+
+class CNNSpeedometer:
+    def __init__(self):
+        self.watch = StopWatch()
+        self.throughoutput_list = []
+
+    def speedometer_cb(
+        self,
+        step,
+        start_time,
+        total_batch_size,
+        skip_iter_num,
+        iter_num,
+        loss_print_every_n_iter,
+    ):
+        def callback(train_loss):
+            assert skip_iter_num >= 0
+            if skip_iter_num == 0 and step == 0:
+                self.watch.set_start(start_time)
+                print("Start trainning without any skipping iteration.")
+            if step < skip_iter_num:
+                if step == 0:
+                    print(
+                        "Skipping {} iterations for benchmark purpose.".format(
+                            skip_iter_num
+                        )
+                    )
+                if step + 1 == skip_iter_num:
+                    self.watch.start()
+                    print("Start trainning.")
+            else:
+                train_step = step - skip_iter_num
+                if (train_step + 1) % loss_print_every_n_iter == 0:
+                    loss = train_loss.mean()
+                    avg_elapse_time_per_iter = (
+                        self.watch.split() / loss_print_every_n_iter
+                    )
+                    samples_per_sec = total_batch_size / avg_elapse_time_per_iter
+                    print(
+                        "iter {}, loss: {:.3f}, speed: {:.3f}(sec/batch), {:.3f}(images/sec)".format(
+                            train_step, loss, avg_elapse_time_per_iter, samples_per_sec
+                        )
+                    )
+                    self.throughoutput_list.append(samples_per_sec)
+                if train_step + 1 == iter_num:
+                    self.watch.stop()
+                    totoal_duration = self.watch.duration()
+                    avg_samples_per_sec = total_batch_size * iter_num / totoal_duration
+                    print("-".ljust(66, "-"))
+                    print(
+                        "average speed: {:.3f}(images/sec), new_cal_method: {:.3f}(images/sec)".format(
+                            avg_samples_per_sec, np.mean(self.throughoutput_list)
+                        )
+                    )
+                    print("-".ljust(66, "-"))
+
+        return callback
diff --git a/python/oneflow/benchmarks/cnn_benchmark/data_loader.py b/python/oneflow/benchmarks/cnn_benchmark/data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..27dea6bd4825897eff62941c2804e78d39a1c6be
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/data_loader.py
@@ -0,0 +1,61 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+
+
+def load_imagenet(
+    data_dir, image_size, batch_size, data_part_num, gpu_image_decoder=False
+):
+    rgb_mean = [123.68, 116.78, 103.94]
+    rgb_std = [255.0, 255.0, 255.0]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=batch_size, data_part_num=data_part_num, name="decode"
+    )
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    if gpu_image_decoder:
+        encoded = flow.data.OFRecordBytesDecoder(ofrecord, "encoded")
+        rsz = flow.data.ImageDecoderRandomCropResize(
+            encoded, target_width=image_size, target_height=image_size, num_workers=3
+        )
+    else:
+        image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+        rsz = flow.image.resize(
+            image, resize_x=image_size, resize_y=image_size, color_space="RGB"
+        )
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        std=rgb_std,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def load_synthetic(image_size, batch_size):
+    label = flow.data.decode_random(
+        shape=(),
+        dtype=flow.int32,
+        batch_size=batch_size,
+        initializer=flow.zeros_initializer(flow.int32),
+    )
+    image = flow.data.decode_random(
+        shape=(3, image_size, image_size), dtype=flow.float, batch_size=batch_size
+    )
+    return (label, image)
diff --git a/python/oneflow/benchmarks/cnn_benchmark/inceptionv3_model.py b/python/oneflow/benchmarks/cnn_benchmark/inceptionv3_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..1679107800dc5fce6b1f70330a1902d08847eee6
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/inceptionv3_model.py
@@ -0,0 +1,477 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kSigmoid,
+    use_bias=True,
+    trainable=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+    else:
+        kernel_size = tuple(kernel_size)
+    weight_shape = (filters, input.shape[1]) + kernel_size
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        elif activation == op_conf_util.kSigmoid:
+            output = flow.math.sigmoid(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def InceptionA(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch5x5"):
+            branch5x5_1 = _conv2d_layer(
+                "conv0", in_blob, filters=48, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch5x5_2 = _conv2d_layer(
+                "conv1",
+                branch5x5_1,
+                filters=64,
+                kernel_size=5,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=32 if index == 0 else 64,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+        inceptionA_bn = []
+        inceptionA_bn.append(branch1x1)
+        inceptionA_bn.append(branch5x5_2)
+        inceptionA_bn.append(branch3x3dbl_3)
+        inceptionA_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionA_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionB(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=3, strides=2, padding="VALID"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool0",
+            )
+        inceptionB_bn = []
+        inceptionB_bn.append(branch3x3)
+        inceptionB_bn.append(branch3x3dbl_3)
+        inceptionB_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionB_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionC(in_blob, index, filters):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch7x7"):
+            branch7x7_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_2 = _conv2d_layer(
+                "conv1",
+                branch7x7_1,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_3 = _conv2d_layer(
+                "conv2",
+                branch7x7_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch7x7dbl"):
+            branch7x7dbl_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_2 = _conv2d_layer(
+                "conv1",
+                branch7x7dbl_1,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_3 = _conv2d_layer(
+                "conv2",
+                branch7x7dbl_2,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_4 = _conv2d_layer(
+                "conv3",
+                branch7x7dbl_3,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_5 = _conv2d_layer(
+                "conv4",
+                branch7x7dbl_4,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionC_bn = []
+        inceptionC_bn.append(branch1x1)
+        inceptionC_bn.append(branch7x7_3)
+        inceptionC_bn.append(branch7x7dbl_5)
+        inceptionC_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionC_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionD(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=320,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch7x7x3"):
+            branch7x7x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch7x7x3_2 = _conv2d_layer(
+                "conv1",
+                branch7x7x3_1,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_3 = _conv2d_layer(
+                "conv2",
+                branch7x7x3_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_4 = _conv2d_layer(
+                "conv3",
+                branch7x7x3_3,
+                filters=192,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool",
+            )
+        inceptionD_bn = []
+        inceptionD_bn.append(branch3x3_2)
+        inceptionD_bn.append(branch7x7x3_4)
+        inceptionD_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionD_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionE(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=320, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3_3 = _conv2d_layer(
+                "conv2",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+            inceptionE_1_bn = []
+            inceptionE_1_bn.append(branch3x3_2)
+            inceptionE_1_bn.append(branch3x3_3)
+            concat_branch3x3 = flow.concat(
+                values=inceptionE_1_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=448, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=384,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_4 = _conv2d_layer(
+                "conv3",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=1,
+                padding="SAME",
+            )
+            inceptionE_2_bn = []
+            inceptionE_2_bn.append(branch3x3dbl_3)
+            inceptionE_2_bn.append(branch3x3dbl_4)
+            concat_branch3x3dbl = flow.concat(
+                values=inceptionE_2_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionE_total_bn = []
+        inceptionE_total_bn.append(branch1x1)
+        inceptionE_total_bn.append(concat_branch3x3)
+        inceptionE_total_bn.append(concat_branch3x3dbl)
+        inceptionE_total_bn.append(branch_pool_2)
+        concat_total = flow.concat(values=inceptionE_total_bn, axis=1, name="concat")
+    return concat_total
+
+
+def inceptionv3(images, labels, trainable=True):
+    conv0 = _conv2d_layer(
+        "conv0", images, filters=32, kernel_size=3, strides=2, padding="VALID"
+    )
+    conv1 = _conv2d_layer(
+        "conv1", conv0, filters=32, kernel_size=3, strides=1, padding="VALID"
+    )
+    conv2 = _conv2d_layer(
+        "conv2", conv1, filters=64, kernel_size=3, strides=1, padding="SAME"
+    )
+    pool1 = flow.nn.max_pool2d(
+        conv2, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    conv3 = _conv2d_layer(
+        "conv3", pool1, filters=80, kernel_size=1, strides=1, padding="VALID"
+    )
+    conv4 = _conv2d_layer(
+        "conv4", conv3, filters=192, kernel_size=3, strides=1, padding="VALID"
+    )
+    pool2 = flow.nn.max_pool2d(
+        conv4, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool2"
+    )
+    mixed_0 = InceptionA(pool2, 0)
+    mixed_1 = InceptionA(mixed_0, 1)
+    mixed_2 = InceptionA(mixed_1, 2)
+    mixed_3 = InceptionB(mixed_2, 3)
+    mixed_4 = InceptionC(mixed_3, 4, 128)
+    mixed_5 = InceptionC(mixed_4, 5, 160)
+    mixed_6 = InceptionC(mixed_5, 6, 160)
+    mixed_7 = InceptionC(mixed_6, 7, 192)
+    mixed_8 = InceptionD(mixed_7, 8)
+    mixed_9 = InceptionE(mixed_8, 9)
+    mixed_10 = InceptionE(mixed_9, 10)
+    pool3 = flow.nn.avg_pool2d(
+        mixed_10, ksize=8, strides=1, padding="VALID", data_format="NCHW", name="pool3"
+    )
+    with flow.scope.namespace("logits"):
+        pool3 = flow.reshape(pool3, [pool3.shape[0], -1])
+        weight = flow.get_variable(
+            "fc1-weight",
+            shape=(pool3.shape[1], 1001),
+            dtype=flow.float,
+            initializer=flow.truncated_normal(0.816496580927726),
+            model_name="weight",
+        )
+        bias = flow.get_variable(
+            "fc1-bias",
+            shape=(1001,),
+            dtype=flow.float,
+            initializer=flow.constant_initializer(),
+            model_name="bias",
+        )
+        fc1 = flow.matmul(pool3, weight)
+        fc1 = flow.nn.bias_add(fc1, bias)
+    return fc1
diff --git a/python/oneflow/benchmarks/cnn_benchmark/model_util.py b/python/oneflow/benchmarks/cnn_benchmark/model_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..93130d7635f145fa8508320e4232aee0dd1cc1e4
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/model_util.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+
+
+def conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation="Relu",
+    use_bias=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == "Relu":
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
diff --git a/python/oneflow/benchmarks/cnn_benchmark/of_cnn_benchmarks.py b/python/oneflow/benchmarks/cnn_benchmark/of_cnn_benchmarks.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a71681dc6a37a50cb646696a42784619a855c9d
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/of_cnn_benchmarks.py
@@ -0,0 +1,263 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+import time
+from datetime import datetime
+
+import alexnet_model
+import benchmark_util
+import data_loader
+import resnet_model
+import vgg_model
+
+import oneflow as flow
+
+parser = argparse.ArgumentParser(description="flags for cnn benchmark")
+parser.add_argument("--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("--node_num", type=int, default=1)
+parser.add_argument(
+    "--node_list",
+    type=str,
+    default=None,
+    required=False,
+    help="nodes' IP address, split by comma",
+)
+parser.add_argument(
+    "--model", type=str, default="vgg16", required=False, help="vgg16 or resnet50"
+)
+parser.add_argument("--batch_size_per_device", type=int, default=8, required=False)
+parser.add_argument("--learning_rate", type=float, default=0.0001, required=False)
+parser.add_argument(
+    "--optimizer", type=str, default="sgd", required=False, help="sgd, adam, momentum"
+)
+parser.add_argument(
+    "--weight_l2",
+    type=float,
+    default=None,
+    required=False,
+    help="weight decay parameter",
+)
+parser.add_argument(
+    "--iter_num", type=int, default=10, required=False, help="total iterations to run"
+)
+parser.add_argument(
+    "--skip_iter_num",
+    type=int,
+    default=0,
+    required=False,
+    help="number of skipping iterations for benchmark purpose.",
+)
+parser.add_argument(
+    "--data_dir", type=str, default=None, required=False, help="dataset directory"
+)
+parser.add_argument(
+    "--data_part_num",
+    type=int,
+    default=32,
+    required=False,
+    help="data part number in dataset",
+)
+parser.add_argument(
+    "--gpu_image_decoder",
+    type=bool,
+    default=False,
+    required=False,
+    help="Whether to use use ImageDecoderRandomCropResize.",
+)
+parser.add_argument(
+    "--image_size", type=int, default=228, required=False, help="image size"
+)
+parser.add_argument(
+    "--loss_print_every_n_iter",
+    type=int,
+    default=1,
+    required=False,
+    help="print loss every n iteration",
+)
+parser.add_argument(
+    "--model_save_every_n_iter",
+    type=int,
+    default=200,
+    required=False,
+    help="save model every n iteration",
+)
+parser.add_argument(
+    "--model_save_dir",
+    type=str,
+    default="./output/model_save-{}".format(
+        str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+    ),
+    required=False,
+    help="model save directory",
+)
+parser.add_argument(
+    "--save_last_snapshot",
+    type=bool,
+    default=False,
+    required=False,
+    help="save model snapshot for last iteration",
+)
+parser.add_argument(
+    "--model_load_dir",
+    type=str,
+    default=None,
+    required=False,
+    help="model load directory",
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default="./output",
+    required=False,
+    help="log info save directory",
+)
+parser.add_argument(
+    "--enable_auto_mixed_precision",
+    type=bool,
+    default=False,
+    required=False,
+    help="automatically change the float net into mixed precision net",
+)
+args = parser.parse_args()
+model_dict = {
+    "resnet50": resnet_model.resnet50,
+    "vgg16": vgg_model.vgg16,
+    "alexnet": alexnet_model.alexnet,
+}
+func_config = flow.FunctionConfig()
+func_config.default_distribute_strategy(flow.scope.consistent_view())
+func_config.default_data_type(flow.float)
+func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+if args.weight_l2:
+    func_config.train.weight_l2(args.weight_l2)
+flow.config.gpu_device_num(args.gpu_num_per_node)
+
+
+def set_up_optimizer(loss, args):
+    loss_scale_policy = None
+    if args.enable_auto_mixed_precision:
+        loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(
+            increment_period=2000
+        )
+    if args.optimizer == "sgd":
+        print("Optimizer:  SGD")
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate]),
+            loss_scale_policy=loss_scale_policy,
+        ).minimize(loss)
+    elif args.optimizer == "momentum":
+        print("Optimizer:  Momentum")
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate]),
+            momentum=0.9,
+            loss_scale_policy=loss_scale_policy,
+        ).minimize(loss)
+    elif args.optimizer == "adam":
+        print("Optimizer:  Adam")
+        flow.optimizer.Adam(
+            flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate]),
+            beta1=0.9,
+            loss_scale_policy=loss_scale_policy,
+        ).minimize(loss)
+
+
+@flow.global_function(func_config)
+def TrainNet():
+    total_device_num = args.node_num * args.gpu_num_per_node
+    batch_size = total_device_num * args.batch_size_per_device
+    if args.data_dir:
+        assert os.path.exists(args.data_dir)
+        print("Loading data from {}".format(args.data_dir))
+        (labels, images) = data_loader.load_imagenet(
+            args.data_dir,
+            args.image_size,
+            batch_size,
+            args.data_part_num,
+            args.gpu_image_decoder,
+        )
+    else:
+        print("Loading synthetic data.")
+        (labels, images) = data_loader.load_synthetic(args.image_size, batch_size)
+    logits = model_dict[args.model](images)
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, logits, name="softmax_loss"
+    )
+    set_up_optimizer(loss, args)
+    return loss
+
+
+def main():
+    print("=".ljust(66, "="))
+    print(
+        "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.model, args.gpu_num_per_node, args.node_num
+        )
+    )
+    print("=".ljust(66, "="))
+    for arg in vars(args):
+        print("{} = {}".format(arg, getattr(args, arg)))
+    print("-".ljust(66, "-"))
+    print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
+    flow.env.log_dir(args.log_dir)
+    if args.node_num > 1:
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+    check_point = flow.train.CheckPoint()
+    if args.model_load_dir:
+        assert os.path.isdir(args.model_load_dir)
+        print("Restoring model from {}.".format(args.model_load_dir))
+        check_point.load(args.model_load_dir)
+    else:
+        print("Init model on demand.")
+        check_point.init()
+    total_batch_size = (
+        args.node_num * args.gpu_num_per_node * args.batch_size_per_device
+    )
+    speedometer = benchmark_util.CNNSpeedometer()
+    start_time = time.time()
+    for step in range(args.skip_iter_num + args.iter_num):
+        cb = speedometer.speedometer_cb(
+            step,
+            start_time,
+            total_batch_size,
+            args.skip_iter_num,
+            args.iter_num,
+            args.loss_print_every_n_iter,
+        )
+        TrainNet().async_get(cb)
+        if (step + 1) % args.model_save_every_n_iter == 0:
+            if not os.path.exists(args.model_save_dir):
+                os.makedirs(args.model_save_dir)
+            snapshot_save_path = os.path.join(
+                args.model_save_dir, "snapshot_%d" % (step + 1)
+            )
+            print("Saving model to {}.".format(snapshot_save_path))
+            check_point.save(snapshot_save_path)
+    if args.save_last_snapshot:
+        snapshot_save_path = os.path.join(args.model_save_dir, "last_snapshot")
+        if not os.path.exists(snapshot_save_path):
+            os.makedirs(snapshot_save_path)
+        print("Saving model to {}.".format(snapshot_save_path))
+        check_point.save(snapshot_save_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/benchmarks/cnn_benchmark/of_cnn_infer_benchmarks.py b/python/oneflow/benchmarks/cnn_benchmark/of_cnn_infer_benchmarks.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd841ace40fc6abad8049c45bf84e5d545c092a4
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/of_cnn_infer_benchmarks.py
@@ -0,0 +1,210 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+import time
+from datetime import datetime
+
+import alexnet_model
+import data_loader
+import inceptionv3_model
+import resnet_model
+import vgg_model
+
+import oneflow as flow
+
+parser = argparse.ArgumentParser(description="flags for cnn benchmark")
+parser.add_argument("--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("--node_num", type=int, default=1)
+parser.add_argument(
+    "--node_list",
+    type=str,
+    default=None,
+    required=False,
+    help="nodes' IP address, split by comma",
+)
+parser.add_argument(
+    "--model", type=str, default="vgg16", required=False, help="vgg16 or resnet50"
+)
+parser.add_argument("--batch_size_per_device", type=int, default=8, required=False)
+parser.add_argument(
+    "--iter_num", type=int, default=10, required=False, help="total iterations to run"
+)
+parser.add_argument(
+    "--warmup_iter_num",
+    type=int,
+    default=0,
+    required=False,
+    help="total iterations to run",
+)
+parser.add_argument(
+    "--data_dir", type=str, default=None, required=False, help="dataset directory"
+)
+parser.add_argument(
+    "--data_part_num",
+    type=int,
+    default=32,
+    required=False,
+    help="data part number in dataset",
+)
+parser.add_argument(
+    "--image_size", type=int, default=228, required=False, help="image size"
+)
+parser.add_argument(
+    "--use_tensorrt",
+    dest="use_tensorrt",
+    action="store_true",
+    default=False,
+    required=False,
+    help="inference with tensorrt",
+)
+parser.add_argument(
+    "--use_xla_jit",
+    dest="use_xla_jit",
+    action="store_true",
+    default=False,
+    required=False,
+    help="inference with xla jit",
+)
+parser.add_argument(
+    "--precision",
+    type=str,
+    default="float32",
+    required=False,
+    help="inference with low precision",
+)
+parser.add_argument(
+    "--print_every_n_iter",
+    type=int,
+    default=1,
+    required=False,
+    help="print log every n iterations",
+)
+parser.add_argument(
+    "--model_load_dir",
+    type=str,
+    default=None,
+    required=False,
+    help="model load directory",
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default="./output",
+    required=False,
+    help="log info save directory",
+)
+args = parser.parse_args()
+model_dict = {
+    "resnet50": resnet_model.resnet50,
+    "inceptionv3": inceptionv3_model.inceptionv3,
+    "vgg16": vgg_model.vgg16,
+    "alexnet": alexnet_model.alexnet,
+}
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+flow.config.gpu_device_num(args.gpu_num_per_node)
+if args.use_tensorrt:
+    func_config.use_tensorrt()
+if args.use_xla_jit:
+    func_config.use_xla_jit()
+if args.precision == "float16":
+    if not args.use_tensorrt:
+        func_config.enable_auto_mixed_precision()
+    else:
+        func_config.tensorrt.use_fp16()
+
+
+@flow.global_function(func_config)
+def InferenceNet():
+    total_device_num = args.node_num * args.gpu_num_per_node
+    batch_size = total_device_num * args.batch_size_per_device
+    if args.data_dir:
+        assert os.path.exists(args.data_dir)
+        print("Loading data from {}".format(args.data_dir))
+        (labels, images) = data_loader.load_imagenet(
+            args.data_dir, args.image_size, batch_size, args.data_part_num
+        )
+    else:
+        print("Loading synthetic data.")
+        (labels, images) = data_loader.load_synthetic(args.image_size, batch_size)
+    logits = model_dict[args.model](images)
+    softmax = flow.nn.softmax(logits)
+    return softmax
+
+
+def main():
+    print("=".ljust(66, "="))
+    print(
+        "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.model, args.gpu_num_per_node, args.node_num
+        )
+    )
+    print("=".ljust(66, "="))
+    for arg in vars(args):
+        print("{} = {}".format(arg, getattr(args, arg)))
+    print("-".ljust(66, "-"))
+    print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
+    flow.env.log_dir(args.log_dir)
+    if args.node_num > 1:
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+    check_point = flow.train.CheckPoint()
+    if args.model_load_dir:
+        assert os.path.isdir(args.model_load_dir)
+        print("Restoring model from {}.".format(args.model_load_dir))
+        check_point.load(args.model_load_dir)
+    else:
+        print("Init model on demand.")
+        check_point.init()
+    print("Runing warm up for {} iterations.".format(args.warmup_iter_num))
+    for step in range(args.warmup_iter_num):
+        predictions = InferenceNet().get()
+    main.total_time = 0.0
+    main.batch_size = args.node_num * args.gpu_num_per_node * args.batch_size_per_device
+    main.start_time = time.time()
+
+    def create_callback(step):
+        def callback(predictions):
+            if step % args.print_every_n_iter == 0:
+                cur_time = time.time()
+                duration = cur_time - main.start_time
+                main.total_time += duration
+                main.start_time = cur_time
+                images_per_sec = main.batch_size / duration
+                print(
+                    "iter {}, speed: {:.3f}(sec/batch), {:.3f}(images/sec)".format(
+                        step, duration, images_per_sec
+                    )
+                )
+                if step == args.iter_num - 1:
+                    avg_img_per_sec = main.batch_size * args.iter_num / main.total_time
+                    print("-".ljust(66, "-"))
+                    print("average speed: {:.3f}(images/sec)".format(avg_img_per_sec))
+                    print("-".ljust(66, "-"))
+
+        return callback
+
+    for step in range(args.iter_num):
+        InferenceNet().async_get(create_callback(step))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/benchmarks/cnn_benchmark/resnet_model.py b/python/oneflow/benchmarks/cnn_benchmark/resnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..0aa55003f0f93eafecad64638bf7c9e4757dea85
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/resnet_model.py
@@ -0,0 +1,141 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+
+BLOCK_COUNTS = [3, 4, 6, 3]
+BLOCK_FILTERS = [256, 512, 1024, 2048]
+BLOCK_FILTERS_INNER = [64, 128, 256, 512]
+
+
+def _conv2d(
+    name,
+    input,
+    filters,
+    kernel_size,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilations=1,
+    trainable=True,
+    weight_initializer=flow.variance_scaling_initializer(data_format="NCHW"),
+):
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=(filters, input.shape[1], kernel_size, kernel_size),
+        dtype=input.dtype,
+        initializer=weight_initializer,
+        trainable=trainable,
+    )
+    return flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilations, name=name
+    )
+
+
+def _batch_norm(inputs, name=None, trainable=True):
+    return flow.layers.batch_normalization(
+        inputs=inputs,
+        axis=1,
+        momentum=0.997,
+        epsilon=1.001e-05,
+        center=True,
+        scale=True,
+        trainable=trainable,
+        name=name,
+    )
+
+
+def conv2d_affine(input, name, filters, kernel_size, strides, activation=None):
+    padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID"
+    output = _conv2d(name, input, filters, kernel_size, strides, padding)
+    output = _batch_norm(output, name + "_bn")
+    if activation == "Relu":
+        output = flow.math.relu(output)
+    return output
+
+
+def bottleneck_transformation(input, block_name, filters, filters_inner, strides):
+    a = conv2d_affine(
+        input, block_name + "_branch2a", filters_inner, 1, 1, activation="Relu"
+    )
+    b = conv2d_affine(
+        a, block_name + "_branch2b", filters_inner, 3, strides, activation="Relu"
+    )
+    c = conv2d_affine(b, block_name + "_branch2c", filters, 1, 1)
+    return c
+
+
+def residual_block(input, block_name, filters, filters_inner, strides_init):
+    if strides_init != 1 or block_name == "res2_0":
+        shortcut = conv2d_affine(
+            input, block_name + "_branch1", filters, 1, strides_init
+        )
+    else:
+        shortcut = input
+    bottleneck = bottleneck_transformation(
+        input, block_name, filters, filters_inner, strides_init
+    )
+    return flow.math.relu(bottleneck + shortcut)
+
+
+def residual_stage(input, stage_name, counts, filters, filters_inner, stride_init=2):
+    output = input
+    for i in range(counts):
+        block_name = "%s_%d" % (stage_name, i)
+        output = residual_block(
+            output, block_name, filters, filters_inner, stride_init if i == 0 else 1
+        )
+    return output
+
+
+def resnet_conv_x_body(input, on_stage_end=lambda x: x):
+    output = input
+    for (i, (counts, filters, filters_inner)) in enumerate(
+        zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER)
+    ):
+        stage_name = "res%d" % (i + 2)
+        output = residual_stage(
+            output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2
+        )
+        on_stage_end(output)
+    return output
+
+
+def resnet_stem(input):
+    conv1 = _conv2d("conv1", input, 64, 7, 2)
+    conv1_bn = flow.math.relu(_batch_norm(conv1, "conv1_bn"))
+    pool1 = flow.nn.max_pool2d(
+        conv1_bn, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    return pool1
+
+
+def resnet50(images, trainable=True):
+    with flow.scope.namespace("Resnet"):
+        stem = resnet_stem(images)
+        body = resnet_conv_x_body(stem, lambda x: x)
+        pool5 = flow.nn.avg_pool2d(
+            body, ksize=7, strides=1, padding="VALID", data_format="NCHW", name="pool5"
+        )
+        fc1001 = flow.layers.dense(
+            flow.reshape(pool5, (pool5.shape[0], -1)),
+            units=1001,
+            use_bias=True,
+            kernel_initializer=flow.xavier_uniform_initializer(),
+            bias_initializer=flow.zeros_initializer(),
+            trainable=trainable,
+            name="fc1001",
+        )
+    return fc1001
diff --git a/python/oneflow/benchmarks/cnn_benchmark/vgg_model.py b/python/oneflow/benchmarks/cnn_benchmark/vgg_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1c735583c1c87c5bf6b5b69225b4fa90a5b09ab
--- /dev/null
+++ b/python/oneflow/benchmarks/cnn_benchmark/vgg_model.py
@@ -0,0 +1,93 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from model_util import conv2d_layer
+
+import oneflow as flow
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+def _conv_block(in_blob, index, filters, conv_times):
+    conv_block = []
+    conv_block.insert(0, in_blob)
+    for i in range(conv_times):
+        conv_i = conv2d_layer(
+            name="conv{}".format(index),
+            input=conv_block[i],
+            filters=filters,
+            kernel_size=3,
+            strides=1,
+        )
+        conv_block.append(conv_i)
+        index += 1
+    return conv_block
+
+
+def vgg16(images, trainable=True):
+    conv1 = _conv_block(images, 0, 64, 2)
+    pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv_block(pool1, 2, 128, 2)
+    pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv_block(pool2, 4, 256, 3)
+    pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", "NCHW", name="pool3")
+    conv4 = _conv_block(pool3, 7, 512, 3)
+    pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", "NCHW", name="pool4")
+    conv5 = _conv_block(pool4, 10, 512, 3)
+    pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_kernel_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    def _get_bias_initializer():
+        bias_initializer = initializer_conf_util.InitializerConf()
+        bias_initializer.constant_conf.value = 0.0
+        return bias_initializer
+
+    pool5 = flow.reshape(pool5, [pool5.shape[0], -1])
+    fc6 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc1",
+    )
+    fc6 = flow.nn.dropout(fc6, rate=0.5)
+    fc7 = flow.layers.dense(
+        inputs=fc6,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc2",
+    )
+    fc7 = flow.nn.dropout(fc7, rate=0.5)
+    fc8 = flow.layers.dense(
+        inputs=fc7,
+        units=1001,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc_final",
+    )
+    return fc8
diff --git a/python/oneflow/benchmarks/coco_data_load/coco_data_loader.py b/python/oneflow/benchmarks/coco_data_load/coco_data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d25cf2682c49299b5bdf5013cb8bc59bc4edc9e5
--- /dev/null
+++ b/python/oneflow/benchmarks/coco_data_load/coco_data_loader.py
@@ -0,0 +1,157 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+import time
+
+import pandas as pd
+
+import oneflow as flow
+
+
+class COCODataLoadConfig(object):
+    def __init__(self):
+        self.annotation_file = (
+            "/dataset/mscoco_2017/annotations/instances_train2017.json"
+        )
+        self.image_dir = "/dataset/mscoco_2017/train2017"
+        self.shuffle_after_epoch = True
+        self.stride_partition = False
+        self.batch_size = 2
+        self.target_size = 800
+        self.max_size = 1333
+        self.image_align_size = 32
+        self.image_normal_std = (1.0, 1.0, 1.0)
+        self.image_normal_mean = (102.9801, 115.9465, 122.7717)
+        self.max_num_objs = 512
+
+
+def roundup(x, align):
+    return int(math.ceil(x / float(align)) * align)
+
+
+def coco_data_load(cfg, machine_id, nrank):
+    with flow.scope.placement("cpu", "{}:0-{}".format(machine_id, nrank - 1)):
+        (
+            image,
+            image_id,
+            image_size,
+            bbox,
+            label,
+            segm_poly,
+            segm_poly_index,
+        ) = flow.data.coco_reader(
+            annotation_file=cfg.annotation_file,
+            image_dir=cfg.image_dir,
+            batch_size=cfg.batch_size,
+            shuffle=cfg.shuffle_after_epoch,
+            stride_partition=cfg.stride_partition,
+            name="coco_reader",
+        )
+        image = flow.image.decode(image, dtype=flow.float)
+        aligned_target_size = roundup(cfg.target_size, cfg.image_align_size)
+        aligned_max_size = roundup(cfg.max_size, cfg.image_align_size)
+        (image, new_size, scale) = flow.image.target_resize(
+            image, target_size=aligned_target_size, max_size=aligned_max_size
+        )
+        bbox = flow.detection.object_bbox_scale(bbox, scale)
+        segm_poly = flow.detection.object_segmentation_polygon_scale(segm_poly, scale)
+        flip_code = flow.random.coin_flip(cfg.batch_size)
+        image = flow.image.flip(image, flip_code)
+        bbox = flow.detection.object_bbox_flip(bbox, new_size, flip_code)
+        segm_poly = flow.detection.object_segmentation_polygon_flip(
+            segm_poly, new_size, flip_code
+        )
+        image = flow.image.normalize(image, cfg.image_normal_std, cfg.image_normal_mean)
+        image = flow.image.batch_align(
+            image,
+            shape=(aligned_target_size, aligned_max_size, 3),
+            dtype=flow.float,
+            alignment=cfg.image_align_size,
+        )
+        gt_bbox = flow.tensor_buffer_to_list_of_tensors(
+            bbox, (cfg.max_num_objs, 4), flow.float, True
+        )
+        gt_label = flow.tensor_buffer_to_list_of_tensors(
+            label, (cfg.max_num_objs,), flow.int32, True
+        )
+        segm_mask = flow.detection.object_segmentation_polygon_to_mask(
+            segm_poly, segm_poly_index, new_size
+        )
+        gt_mask = flow.tensor_buffer_to_list_of_tensors(
+            segm_mask,
+            (cfg.max_num_objs, aligned_target_size, aligned_max_size),
+            flow.int8,
+            True,
+        )
+        return {
+            "image": image,
+            "image_size": new_size,
+            "gt_bbox": list(gt_bbox),
+            "gt_label": list(gt_label),
+            "gt_mask": list(gt_mask),
+        }
+
+
+def _make_data_load_fn():
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_distribute_strategy(flow.scope.consistent_view())
+    cfg = COCODataLoadConfig()
+
+    @flow.global_function(func_config)
+    def data_load_fn():
+        return coco_data_load(cfg, 0, 1)
+
+    return data_load_fn
+
+
+def _benchmark(iter_num, drop_first_iters, verbose=False):
+    flow.env.init()
+    data_loader = _make_data_load_fn()
+    s = pd.Series([], name="time_elapsed", dtype="float32")
+    timestamp = time.perf_counter()
+    for i in range(iter_num):
+        dict = data_loader().get()
+        image = dict["image"]
+        image_size = dict["image_size"]
+        gt_bbox = dict["gt_bbox"]
+        gt_label = dict["gt_label"]
+        gt_mask = dict["gt_mask"]
+        cur = time.perf_counter()
+        s[i] = cur - timestamp
+        timestamp = cur
+        if verbose:
+            print("==== iter {} ====".format(i))
+            print(
+                "image: {}\n".format(image.numpy_list()[0].shape), image.numpy_list()[0]
+            )
+            print(
+                "image_size: {}\n".format(image_size.numpy().shape), image_size.numpy()
+            )
+            print("gt_bbox:\n", [x.numpy_list()[0] for x in gt_bbox])
+            print("gt_label:\n", [x.numpy_list()[0] for x in gt_label])
+            print("gt_mask:\n", [x.numpy_list()[0] for x in gt_mask])
+    print(
+        "mean of time elapsed of {} iters (dropped {} first iters): {}".format(
+            iter_num, drop_first_iters, s[drop_first_iters:].mean()
+        )
+    )
+    s.to_csv("coco_data_benchmark.csv", header=True)
+
+
+if __name__ == "__main__":
+    _benchmark(500, 10)
diff --git a/python/oneflow/checkpoint.py b/python/oneflow/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9e378cbefe1568344ac34942e21b27c943b65f
--- /dev/null
+++ b/python/oneflow/checkpoint.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.check_point_v2 import GetCheckpoint as get
+from oneflow.framework.check_point_v2 import SaveVarDict as save
diff --git a/python/oneflow/compatible/__init__.py b/python/oneflow/compatible/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/F/__init__.py b/python/oneflow/compatible/single_client/F/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/__init__.py b/python/oneflow/compatible/single_client/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..215e7eec2f8868e024350444fd95d78bf3f7bca7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/__init__.py
@@ -0,0 +1,437 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow._oneflow_internal
+
+Size = oneflow._oneflow_internal.Size
+device = oneflow._oneflow_internal.device
+placement = oneflow._oneflow_internal.placement
+no_grad = oneflow._oneflow_internal.autograd.no_grad
+locals()["dtype"] = oneflow._oneflow_internal.dtype
+locals()["char"] = oneflow._oneflow_internal.char
+locals()["float16"] = oneflow._oneflow_internal.float16
+locals()["half"] = oneflow._oneflow_internal.float16
+locals()["float32"] = oneflow._oneflow_internal.float32
+locals()["float"] = oneflow._oneflow_internal.float
+locals()["double"] = oneflow._oneflow_internal.double
+locals()["float64"] = oneflow._oneflow_internal.float64
+locals()["int8"] = oneflow._oneflow_internal.int8
+locals()["int"] = oneflow._oneflow_internal.int32
+locals()["int32"] = oneflow._oneflow_internal.int32
+locals()["int64"] = oneflow._oneflow_internal.int64
+locals()["long"] = oneflow._oneflow_internal.int64
+locals()["uint8"] = oneflow._oneflow_internal.uint8
+locals()["record"] = oneflow._oneflow_internal.record
+locals()["tensor_buffer"] = oneflow._oneflow_internal.tensor_buffer
+from oneflow.compatible.single_client.framework import (
+    env_util,
+    session_context,
+    session_util,
+)
+from oneflow.core.job.job_conf_pb2 import JobConfigProto
+from oneflow.core.job.job_set_pb2 import ConfigProto
+
+oneflow._oneflow_internal.DestroyGlobalWatcher()
+oneflow._oneflow_internal.DestroyGlobalForeignCallback()
+oneflow._oneflow_internal.DestroyEnv()
+import time
+
+time.sleep(1)
+del time
+oneflow._oneflow_internal.SetIsMultiClient(False)
+session_context.OpenDefaultSession(
+    session_util.Session(oneflow._oneflow_internal.NewSessionId())
+)
+oneflow._oneflow_internal.EnableEagerEnvironment(False)
+del env_util
+del session_util
+del session_context
+import oneflow.compatible.single_client.framework.c_api_util
+from oneflow.compatible.single_client.framework import (
+    python_callback,
+    register_python_callback,
+)
+
+oneflow._oneflow_internal.RegisterGlobalForeignCallback(
+    python_callback.global_python_callback
+)
+del python_callback
+del register_python_callback
+from oneflow.compatible.single_client.framework import watcher
+
+oneflow._oneflow_internal.RegisterGlobalWatcher(watcher._global_watcher)
+del watcher
+from oneflow.compatible.single_client.eager import boxing_util
+
+oneflow._oneflow_internal.deprecated.RegisterBoxingUtilOnlyOnce(
+    boxing_util._global_boxing_util
+)
+del boxing_util
+from oneflow.compatible.single_client.ops.util import custom_op_module
+
+oneflow._oneflow_internal.RegisterPyKernels(
+    custom_op_module._python_kernel_reg.kernels_
+)
+del custom_op_module
+from oneflow.compatible.single_client.framework import register_class_method_util
+
+register_class_method_util.RegisterMethod4Class()
+del register_class_method_util
+INVALID_SPLIT_AXIS = oneflow._oneflow_internal.INVALID_SPLIT_AXIS
+import atexit
+
+from oneflow.compatible.single_client.framework.session_context import (
+    TryCloseAllSession,
+)
+
+atexit.register(TryCloseAllSession)
+del TryCloseAllSession
+del atexit
+import sys
+
+__original_exit__ = sys.exit
+
+
+def custom_exit(returncode):
+    if returncode != 0:
+        import oneflow
+
+        oneflow._oneflow_internal.MasterSendAbort()
+    __original_exit__(returncode)
+
+
+sys.exit = custom_exit
+del custom_exit
+del sys
+import oneflow.compatible.single_client.nn.modules.acosh
+import oneflow.compatible.single_client.nn.modules.activation
+import oneflow.compatible.single_client.nn.modules.argwhere
+import oneflow.compatible.single_client.nn.modules.atan2
+import oneflow.compatible.single_client.nn.modules.atanh
+import oneflow.compatible.single_client.nn.modules.bmm
+import oneflow.compatible.single_client.nn.modules.constant
+import oneflow.compatible.single_client.nn.modules.floor
+import oneflow.compatible.single_client.nn.modules.greater
+import oneflow.compatible.single_client.nn.modules.greater_equal
+import oneflow.compatible.single_client.nn.modules.masked_select
+import oneflow.compatible.single_client.nn.modules.math_ops
+import oneflow.compatible.single_client.nn.modules.norm
+import oneflow.compatible.single_client.nn.modules.permute
+import oneflow.compatible.single_client.nn.modules.round
+import oneflow.compatible.single_client.nn.modules.sign
+import oneflow.compatible.single_client.nn.modules.sinh
+import oneflow.compatible.single_client.nn.modules.tan
+import oneflow.compatible.single_client.nn.modules.tensor_ops
+from oneflow.compatible.single_client.advanced.distribute_ops import (
+    cast_to_current_logical_view,
+)
+from oneflow.compatible.single_client.deprecated.initializer_util import (
+    truncated_normal_initializer as truncated_normal,
+)
+from oneflow.compatible.single_client.experimental.namescope import (
+    deprecated_name_scope as name_scope,
+)
+from oneflow.compatible.single_client.framework.check_point_v2 import (
+    GetAllVariables as get_all_variables,
+)
+from oneflow.compatible.single_client.framework.check_point_v2 import Load as load
+from oneflow.compatible.single_client.framework.check_point_v2 import (
+    LoadVariables as load_variables,
+)
+from oneflow.compatible.single_client.framework.check_point_v2 import save
+from oneflow.compatible.single_client.framework.dtype import (
+    convert_oneflow_dtype_to_numpy_dtype,
+    dtypes,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_enable_eager_execution as enable_eager_execution,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_get_current_machine_id as current_machine_id,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_get_current_resource as current_resource,
+)
+from oneflow.compatible.single_client.framework.function_desc import (
+    api_current_global_function_desc as current_global_function_desc,
+)
+from oneflow.compatible.single_client.framework.function_util import FunctionConfig
+from oneflow.compatible.single_client.framework.function_util import (
+    FunctionConfig as ExecutionConfig,
+)
+from oneflow.compatible.single_client.framework.function_util import (
+    FunctionConfig as function_config,
+)
+from oneflow.compatible.single_client.framework.function_util import (
+    api_oneflow_function as global_function,
+)
+from oneflow.compatible.single_client.framework.generator import (
+    create_generator as Generator,
+)
+from oneflow.compatible.single_client.framework.generator import (
+    default_generator,
+    manual_seed,
+)
+from oneflow.compatible.single_client.framework.input_blob_def import (
+    DeprecatedFixedTensorDef as FixedTensorDef,
+)
+from oneflow.compatible.single_client.framework.input_blob_def import (
+    DeprecatedMirroredTensorDef as MirroredTensorDef,
+)
+from oneflow.compatible.single_client.framework.job_set_util import (
+    inter_job_reuse_mem_strategy,
+)
+from oneflow.compatible.single_client.framework.model import Model
+from oneflow.compatible.single_client.framework.ops import api_acc as acc
+from oneflow.compatible.single_client.framework.ops import (
+    api_hierarchical_parallel_cast as hierarchical_parallel_cast,
+)
+from oneflow.compatible.single_client.framework.ops import api_pack as pack
+from oneflow.compatible.single_client.framework.ops import (
+    api_parallel_cast as parallel_cast,
+)
+from oneflow.compatible.single_client.framework.ops import api_repeat as repeat
+from oneflow.compatible.single_client.framework.ops import api_unpack as unpack
+from oneflow.compatible.single_client.framework.placement_util import (
+    deprecated_placement as device_prior_placement,
+)
+from oneflow.compatible.single_client.framework.placement_util import (
+    deprecated_placement as fixed_placement,
+)
+from oneflow.compatible.single_client.framework.scope_util import (
+    api_current_scope as current_scope,
+)
+from oneflow.compatible.single_client.framework.session_util import (
+    TmpInitEagerGlobalSession as InitEagerGlobalSession,
+)
+from oneflow.compatible.single_client.framework.session_util import (
+    api_clear_default_session as clear_default_session,
+)
+from oneflow.compatible.single_client.framework.session_util import (
+    api_eager_execution_enabled as eager_execution_enabled,
+)
+from oneflow.compatible.single_client.framework.session_util import (
+    api_find_or_create_module as find_or_create_module,
+)
+from oneflow.compatible.single_client.framework.session_util import (
+    api_sync_default_session as sync_default_session,
+)
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.framework.tensor import construct_tensor as tensor
+from oneflow.compatible.single_client.nn.modules.to import to_op as to
+from oneflow.compatible.single_client.ops.array_ops import amp_white_identity
+from oneflow.compatible.single_client.ops.array_ops import (
+    api_slice_update as slice_update,
+)
+from oneflow.compatible.single_client.ops.array_ops import (
+    argwhere,
+    broadcast_like,
+    cast_to_static_shape,
+    concat,
+    dim_gather,
+    dynamic_reshape,
+    elem_cnt,
+    expand,
+    expand_dims,
+    flatten,
+    gather,
+    gather_nd,
+    identity,
+    identity_n,
+    masked_fill,
+    nonzero,
+    ones,
+    reshape,
+    reshape_like,
+    reverse,
+    scatter_nd,
+    slice,
+    slice_v2,
+    squeeze,
+    stack,
+    sync_dynamic_resize,
+    tensor_scatter_nd_add,
+    tensor_scatter_nd_update,
+    transpose,
+    where,
+    zeros,
+)
+from oneflow.compatible.single_client.ops.assign_op import assign
+from oneflow.compatible.single_client.ops.builtin_ops import BuiltinOp as builtin_op
+from oneflow.compatible.single_client.ops.categorical_ordinal_encode_op import (
+    categorical_ordinal_encode,
+)
+from oneflow.compatible.single_client.ops.combined_margin_loss import (
+    combined_margin_loss,
+)
+from oneflow.compatible.single_client.ops.constant_op import (
+    constant,
+    constant_like,
+    constant_scalar,
+    ones_like,
+    zeros_like,
+)
+from oneflow.compatible.single_client.ops.count_not_finite import (
+    count_not_finite,
+    multi_count_not_finite,
+)
+from oneflow.compatible.single_client.ops.diag_ops import diag
+from oneflow.compatible.single_client.ops.eager_nccl_ops import eager_nccl_all_reduce
+from oneflow.compatible.single_client.ops.get_variable import (
+    api_get_variable as get_variable,
+)
+from oneflow.compatible.single_client.ops.initializer_util import (
+    constant_initializer,
+    empty_initializer,
+)
+from oneflow.compatible.single_client.ops.initializer_util import (
+    glorot_normal_initializer,
+)
+from oneflow.compatible.single_client.ops.initializer_util import (
+    glorot_normal_initializer as xavier_normal_initializer,
+)
+from oneflow.compatible.single_client.ops.initializer_util import (
+    glorot_uniform_initializer,
+)
+from oneflow.compatible.single_client.ops.initializer_util import (
+    glorot_uniform_initializer as xavier_uniform_initializer,
+)
+from oneflow.compatible.single_client.ops.initializer_util import (
+    kaiming_initializer,
+    ones_initializer,
+    random_normal_initializer,
+    random_uniform_initializer,
+    truncated_normal_initializer,
+    variance_scaling_initializer,
+    zeros_initializer,
+)
+from oneflow.compatible.single_client.ops.linalg import matmul
+from oneflow.compatible.single_client.ops.loss_ops import ctc_loss, smooth_l1_loss
+from oneflow.compatible.single_client.ops.math_ops import (
+    broadcast_to_compatible_with as broadcast_to_compatible_with,
+)
+from oneflow.compatible.single_client.ops.math_ops import cast
+from oneflow.compatible.single_client.ops.math_ops import clip_by_value as clamp
+from oneflow.compatible.single_client.ops.math_ops import clip_by_value as clip
+from oneflow.compatible.single_client.ops.math_ops import (
+    clip_by_value as clip_by_scalar,
+)
+from oneflow.compatible.single_client.ops.math_ops import clip_by_value as clip_by_value
+from oneflow.compatible.single_client.ops.math_ops import in_top_k as in_top_k
+from oneflow.compatible.single_client.ops.math_ops import range
+from oneflow.compatible.single_client.ops.math_ops import (
+    unsorted_batch_segment_sum as unsorted_batch_segment_sum,
+)
+from oneflow.compatible.single_client.ops.math_ops import (
+    unsorted_segment_sum as unsorted_segment_sum,
+)
+from oneflow.compatible.single_client.ops.math_ops import (
+    unsorted_segment_sum_like as unsorted_segment_sum_like,
+)
+from oneflow.compatible.single_client.ops.one_hot import one_hot
+from oneflow.compatible.single_client.ops.pad import (
+    constant_pad2d,
+    pad,
+    pad_grad,
+    reflection_pad2d,
+    replication_pad2d,
+    same_padding,
+    zero_pad2d,
+)
+from oneflow.compatible.single_client.ops.partial_fc_sample import (
+    distributed_partial_fc_sample,
+)
+from oneflow.compatible.single_client.ops.sort_ops import argsort, sort
+from oneflow.compatible.single_client.ops.tensor_buffer_ops import (
+    gen_tensor_buffer,
+    tensor_buffer_to_list_of_tensors,
+    tensor_buffer_to_tensor,
+    tensor_to_tensor_buffer,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_image_random_crop as image_random_crop,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_image_resize as image_resize,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_image_target_resize as image_target_resize,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    image_batch_align as image_batch_align,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    image_decode as image_decode,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import image_flip as image_flip
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    image_normalize as image_normalize,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_bbox_flip as object_bbox_flip,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_bbox_scale as object_bbox_scale,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_segm_poly_flip as object_segmentation_polygon_flip,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_segm_poly_scale as object_segmentation_polygon_scale,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_segm_poly_to_mask as object_segmentation_polygon_to_mask,
+)
+from oneflow.compatible.single_client.ops.user_op_builder import (
+    api_consistent_user_op_builder as consistent_user_op_builder,
+)
+from oneflow.compatible.single_client.ops.user_op_builder import (
+    api_consistent_user_op_module_builder as consistent_user_op_module_builder,
+)
+from oneflow.compatible.single_client.ops.user_op_builder import (
+    api_user_op_builder as user_op_builder,
+)
+from oneflow.compatible.single_client.ops.user_op_builder import (
+    api_user_op_module_builder as user_op_module_builder,
+)
+from oneflow.compatible.single_client.ops.watch import Watch as watch
+from oneflow.compatible.single_client.ops.watch import WatchDiff as watch_diff
+
+from . import (
+    checkpoint,
+    config,
+    data,
+    distribute,
+    distributed,
+    env,
+    image,
+    layers,
+    losses,
+    math,
+    model,
+    optimizer,
+    profiler,
+    random,
+    regularizers,
+    saved_model,
+    scope,
+    summary,
+    sysconfig,
+    tensorrt,
+    train,
+    typing,
+    util,
+)
diff --git a/python/oneflow/compatible/single_client/__main__.py b/python/oneflow/compatible/single_client/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..42865d5096cd88b14edee0d002a9562e8b8aca60
--- /dev/null
+++ b/python/oneflow/compatible/single_client/__main__.py
@@ -0,0 +1,45 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--start_worker", default=False, action="store_true", required=False
+)
+parser.add_argument("--env_proto", type=str, required=False)
+args = parser.parse_args()
+
+
+def StartWorker(env_proto):
+    import oneflow._oneflow_internal
+
+    oneflow._oneflow_internal.InitEnv(env_proto, False)
+
+
+def main():
+    start_worker = args.start_worker
+    if start_worker:
+        env_proto = args.env_proto
+        assert os.path.isfile(
+            env_proto
+        ), "env_proto not found, please check your env_proto path: {}".format(env_proto)
+        with open(env_proto, "rb") as f:
+            StartWorker(f.read())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/compatible/single_client/advanced/__init__.py b/python/oneflow/compatible/single_client/advanced/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..14070b2c5c09238bca2ed58dced16ef15bc0b39a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/advanced/__init__.py
@@ -0,0 +1,31 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.compatible.single_client.advanced.distribute_ops import (
+    api_distribute_add as distribute_add,
+)
+from oneflow.compatible.single_client.advanced.distribute_ops import (
+    api_distribute_clone as distribute_clone,
+)
+from oneflow.compatible.single_client.advanced.distribute_ops import (
+    api_distribute_concat as distribute_concat,
+)
+from oneflow.compatible.single_client.advanced.distribute_ops import (
+    api_distribute_map as distribute_map,
+)
+from oneflow.compatible.single_client.advanced.distribute_ops import (
+    api_distribute_split as distribute_split,
+)
diff --git a/python/oneflow/compatible/single_client/advanced/distribute_ops.py b/python/oneflow/compatible/single_client/advanced/distribute_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..64487f836398fb0064014a2106b5068dc9cc929a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/advanced/distribute_ops.py
@@ -0,0 +1,226 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Callable, List, Optional, Sequence, Tuple, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def api_distribute_clone(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    func = enable_if.unique([distribute_clone])
+    return func(x, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_clone(x, name=None):
+    if name is None:
+        name = id_util.UniqueStr("DistributeClone_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    setattr(op_conf.distribute_clone_conf, "in", x.unique_name)
+    parallel_size = flow.current_scope().device_parallel_desc_symbol.parallel_num
+    op_conf.distribute_clone_conf.out.extend(
+        ["out_%d" % i for i in range(parallel_size)]
+    )
+    interpret_util.ConsistentForward(op_conf)
+    ret = []
+    for i in range(parallel_size):
+        out = "out_%d" % i
+        lbi = logical_blob_id_util.LogicalBlobId()
+        lbi.op_name = op_conf.name
+        lbi.blob_name = out
+        ret.append(remote_blob_util.RemoteBlob(lbi))
+    return tuple(ret)
+
+
+def api_distribute_add(
+    xs: Sequence[oneflow._oneflow_internal.BlobDesc], name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([distribute_add])
+    return func(xs, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_add(xs, name=None):
+    assert flow.current_scope().device_parallel_desc_symbol.parallel_num == len(xs)
+    if name is None:
+        name = id_util.UniqueStr("DistributeAdd_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    getattr(op_conf.distribute_add_conf, "in").extend(
+        [_SoleConsistentLbn(x) for x in xs]
+    )
+    op_conf.distribute_add_conf.out = "out"
+    interpret_util.ConsistentForward(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def api_distribute_split(
+    x: oneflow._oneflow_internal.BlobDesc, axis: int = 0, name: Optional[str] = None
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    func = enable_if.unique([distribute_split])
+    return func(x, axis=axis, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_split(x, axis=0, name=None):
+    if name is None:
+        name = id_util.UniqueStr("DistributeSplit_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    setattr(op_conf.distribute_split_conf, "in", x.unique_name)
+    op_conf.distribute_split_conf.axis = axis
+    parallel_size = flow.current_scope().device_parallel_desc_symbol.parallel_num
+    op_conf.distribute_split_conf.out.extend(
+        ["out_%d" % i for i in range(parallel_size)]
+    )
+    interpret_util.ConsistentForward(op_conf)
+    ret = []
+    for i in range(parallel_size):
+        out = "out_%d" % i
+        lbi = logical_blob_id_util.LogicalBlobId()
+        lbi.op_name = op_conf.name
+        lbi.blob_name = out
+        ret.append(remote_blob_util.RemoteBlob(lbi))
+    return tuple(ret)
+
+
+def api_distribute_concat(
+    xs: Sequence[oneflow._oneflow_internal.BlobDesc],
+    axis: int = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([distribute_concat])
+    return func(xs, axis=axis, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_concat(xs, axis=0, name=None):
+    assert flow.current_scope().device_parallel_desc_symbol.parallel_num == len(xs)
+    if name is None:
+        name = id_util.UniqueStr("DistributeConcat_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    getattr(op_conf.distribute_concat_conf, "in").extend(
+        [_SoleConsistentLbn(x) for x in xs]
+    )
+    op_conf.distribute_concat_conf.axis = axis
+    op_conf.distribute_concat_conf.out = "out"
+    interpret_util.ConsistentForward(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def api_distribute_map(
+    xs: Union[
+        Sequence[oneflow._oneflow_internal.BlobDesc], oneflow._oneflow_internal.BlobDesc
+    ],
+    f: Callable[
+        [oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc],
+        oneflow._oneflow_internal.BlobDesc,
+    ],
+    axis: int = 0,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    func = enable_if.unqiue([distribute_map])
+    return func(xs, f, axis=axis)
+
+
+@enable_if.condition(hob.in_global_mode)
+def distribute_map(xs, f, axis=0):
+    _AssertInputOrOutput(xs)
+    if isinstance(xs, (list, tuple)) == False:
+        xs = [xs]
+    splitted_xs = [flow.advanced.distribute_split(x, axis=axis) for x in xs]
+    results = [_UnderSingleDevicePlacementScope(f, *x) for x in zip(*splitted_xs)]
+    output_is_not_container = all(
+        [isinstance(x, oneflow._oneflow_internal.ConsistentBlob) for x in results]
+    )
+    results = [_TryWrapTuple(x) for x in results]
+    result = [flow.advanced.distribute_concat(x, axis=axis) for x in zip(*results)]
+    if output_is_not_container:
+        return result[0]
+    return tuple(result)
+
+
+def cast_to_current_logical_view(
+    x: oneflow._oneflow_internal.BlobDesc,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if (
+        isinstance(x, oneflow._oneflow_internal.ConsistentBlob)
+        and flow.scope.mirrored_view_enabled()
+        or (
+            isinstance(x, oneflow._oneflow_internal.MirroredBlob)
+            and flow.scope.consistent_view_enabled()
+        )
+    ):
+        x = flow.identity(x)
+    return x
+
+
+def _SoleConsistentLbn(blob):
+    assert blob.parallel_size == 1
+    if isinstance(blob, oneflow._oneflow_internal.ConsistentBlob):
+        return blob.unique_name
+    if isinstance(blob, oneflow._oneflow_internal.MirroredBlob):
+        return blob.sub_consistent_blob_list[0].unique_name
+    raise NotImplementedError
+
+
+def _AssertInputOrOutput(xs):
+    assert isinstance(xs, (list, tuple, oneflow._oneflow_internal.ConsistentBlob))
+    if isinstance(xs, (list, tuple)):
+        assert len(xs) > 0
+        assert all(
+            [isinstance(x, oneflow._oneflow_internal.ConsistentBlob) for x in xs]
+        )
+
+
+def _TryWrapTuple(ys):
+    _AssertInputOrOutput(ys)
+    if isinstance(ys, (list, tuple)) == False:
+        ys = (ys,)
+    return ys
+
+
+def _UnderSingleDevicePlacementScope(f, *args):
+    parallel_desc_symbol = flow.current_scope().device_parallel_desc_symbol
+    for (machine_id, device_id) in _EachMachineIdAndDeviceId(parallel_desc_symbol):
+        mch_dev_str = "@%d:%d" % (machine_id, device_id)
+        with flow.scope.placement(parallel_desc_symbol.device_tag, mch_dev_str):
+            return f(*args)
+
+
+def _EachMachineIdAndDeviceId(parallel_desc_symbol):
+    for (
+        machine_id,
+        device_id_list,
+    ) in parallel_desc_symbol.machine_id2device_id_list.items():
+        for device_id in device_id_list:
+            yield (machine_id, device_id)
diff --git a/python/oneflow/compatible/single_client/autograd/__init__.py b/python/oneflow/compatible/single_client/autograd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cd8029edd4c55046da8d643ee732c2b8aa0f05d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/autograd/__init__.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.compatible.single_client.autograd.autograd import backward, grad
diff --git a/python/oneflow/compatible/single_client/autograd/autograd.py b/python/oneflow/compatible/single_client/autograd/autograd.py
new file mode 100644
index 0000000000000000000000000000000000000000..2afad6bc4e89a45d476c398f2ed6ffda441bef9c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/autograd/autograd.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence, Tuple, Union
+
+from oneflow._oneflow_internal import TensorTuple
+from oneflow._oneflow_internal.autograd import backward as backward_api
+from oneflow._oneflow_internal.autograd import grad as grad_api
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.framework.tensor_tuple_util import (
+    convert_to_tensor_tuple,
+)
+
+
+def grad(
+    outputs: Union[Tensor, Sequence[Tensor]],
+    inputs: Union[Tensor, Sequence[Tensor]],
+    out_grads: Union[Tensor, Sequence[Tensor], None] = None,
+    retain_graph: bool = False,
+    create_graph: bool = False,
+) -> Tuple[Tensor]:
+    in_grads = grad_api(
+        convert_to_tensor_tuple(outputs),
+        convert_to_tensor_tuple(inputs),
+        convert_to_tensor_tuple(out_grads),
+        retain_graph,
+        create_graph,
+    )
+    return tuple([Tensor(x) for x in in_grads])
+
+
+def backward(
+    outputs: Union[Tensor, Sequence[Tensor]],
+    out_grads: Union[Tensor, Sequence[Tensor], None],
+    retain_graph: bool = False,
+    create_graph: bool = False,
+) -> None:
+    backward_api(
+        convert_to_tensor_tuple(outputs),
+        convert_to_tensor_tuple(out_grads),
+        retain_graph,
+        create_graph,
+    )
diff --git a/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/benchmark_util.py b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..95bf2d602b6fffe9d9c4a1e8f9c535e2c2102227
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/benchmark_util.py
@@ -0,0 +1,110 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import time
+
+import numpy as np
+
+
+class StopWatch:
+    def __init__(self):
+        pass
+
+    def start(self):
+        self.start_time = time.time()
+        self.last_split = self.start_time
+
+    def set_start(self, val):
+        self.start_time = val
+        self.last_split = self.start_time
+
+    def split(self):
+        now = time.time()
+        duration = now - self.last_split
+        self.last_split = now
+        return duration
+
+    def stop(self):
+        self.stop_time = time.time()
+
+    def duration(self):
+        return self.stop_time - self.start_time
+
+
+class BERTSpeedometer:
+    def __init__(self):
+        self.watch = StopWatch()
+        self.throughoutput_list = []
+
+    def speedometer_cb(
+        self,
+        step,
+        start_time,
+        total_batch_size,
+        skip_iter_num,
+        iter_num,
+        loss_print_every_n_iter,
+    ):
+        def callback(train_loss):
+            assert skip_iter_num >= 0
+            if skip_iter_num == 0 and step == 0:
+                self.watch.set_start(start_time)
+                print("Start trainning without any skipping iteration.")
+            if step < skip_iter_num:
+                if step == 0:
+                    print(
+                        "Skipping {} iterations for benchmark purpose.".format(
+                            skip_iter_num
+                        )
+                    )
+                if step + 1 == skip_iter_num:
+                    self.watch.start()
+                    print("Start trainning.")
+            else:
+                train_step = step - skip_iter_num
+                if (train_step + 1) % loss_print_every_n_iter == 0:
+                    total_loss = train_loss[0].mean()
+                    mlm_loss = train_loss[1].mean()
+                    nsp_loss = train_loss[2].mean()
+                    avg_elapse_time_per_iter = (
+                        self.watch.split() / loss_print_every_n_iter
+                    )
+                    sentences_per_sec = total_batch_size / avg_elapse_time_per_iter
+                    print(
+                        "iter {}, total_loss: {:.3f}, mlm_loss: {:.3f}, nsp_loss: {:.3f}, speed: {:.3f}(sec/batch), {:.3f}(sentences/sec)".format(
+                            train_step,
+                            total_loss,
+                            mlm_loss,
+                            nsp_loss,
+                            avg_elapse_time_per_iter,
+                            sentences_per_sec,
+                        )
+                    )
+                    self.throughoutput_list.append(sentences_per_sec)
+                if train_step + 1 == iter_num:
+                    self.watch.stop()
+                    totoal_duration = self.watch.duration()
+                    avg_sentences_per_sec = (
+                        total_batch_size * iter_num / totoal_duration
+                    )
+                    print("-".ljust(66, "-"))
+                    print(
+                        "average speed: {:.3f}(sentences/sec), new_cal_method: {:.3f}(sentences/sec)".format(
+                            avg_sentences_per_sec, np.mean(self.throughoutput_list)
+                        )
+                    )
+                    print("-".ljust(66, "-"))
+
+        return callback
diff --git a/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/bert.py b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..62f1f8b23067f5f1b8d04f599d120ef7b4ca712b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/bert.py
@@ -0,0 +1,397 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.common import data_type_pb2 as data_type_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+class BertBackbone(object):
+    def __init__(
+        self,
+        input_ids_blob,
+        input_mask_blob,
+        token_type_ids_blob,
+        vocab_size,
+        seq_length=512,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        initializer_range=0.02,
+    ):
+        with flow.scope.namespace("bert"):
+            with flow.scope.namespace("embeddings"):
+                (self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
+                    input_ids_blob=input_ids_blob,
+                    vocab_size=vocab_size,
+                    embedding_size=hidden_size,
+                    initializer_range=initializer_range,
+                    word_embedding_name="word_embeddings",
+                )
+                self.embedding_output_ = _EmbeddingPostprocessor(
+                    input_blob=self.embedding_output_,
+                    seq_length=seq_length,
+                    embedding_size=hidden_size,
+                    use_token_type=True,
+                    token_type_ids_blob=token_type_ids_blob,
+                    token_type_vocab_size=type_vocab_size,
+                    token_type_embedding_name="token_type_embeddings",
+                    use_position_embeddings=True,
+                    position_embedding_name="position_embeddings",
+                    initializer_range=initializer_range,
+                    max_position_embeddings=max_position_embeddings,
+                    dropout_prob=hidden_dropout_prob,
+                )
+            with flow.scope.namespace("encoder"):
+                attention_mask_blob = _CreateAttentionMaskFromInputMask(
+                    input_mask_blob,
+                    from_seq_length=seq_length,
+                    to_seq_length=seq_length,
+                )
+                self.all_encoder_layers_ = _TransformerModel(
+                    input_blob=self.embedding_output_,
+                    attention_mask_blob=attention_mask_blob,
+                    seq_length=seq_length,
+                    hidden_size=hidden_size,
+                    num_hidden_layers=num_hidden_layers,
+                    num_attention_heads=num_attention_heads,
+                    intermediate_size=intermediate_size,
+                    intermediate_act_fn=GetActivation(hidden_act),
+                    hidden_dropout_prob=hidden_dropout_prob,
+                    attention_probs_dropout_prob=attention_probs_dropout_prob,
+                    initializer_range=initializer_range,
+                    do_return_all_layers=False,
+                )
+            self.sequence_output_ = self.all_encoder_layers_[-1]
+
+    def embedding_output(self):
+        return self.embedding_output_
+
+    def all_encoder_layers(self):
+        return self.all_encoder_layers_
+
+    def sequence_output(self):
+        return self.sequence_output_
+
+    def embedding_table(self):
+        return self.embedding_table_
+
+
+def CreateInitializer(std):
+    return flow.truncated_normal(std)
+
+
+def _Gelu(in_blob):
+    return flow.math.gelu(in_blob)
+
+
+def _TransformerModel(
+    input_blob,
+    attention_mask_blob,
+    seq_length,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    intermediate_act_fn=_Gelu,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    initializer_range=0.02,
+    do_return_all_layers=False,
+):
+    assert hidden_size % num_attention_heads == 0
+    attention_head_size = int(hidden_size / num_attention_heads)
+    input_width = hidden_size
+    prev_output_blob = flow.reshape(input_blob, (-1, input_width))
+    all_layer_output_blobs = []
+    for layer_idx in range(num_hidden_layers):
+        with flow.scope.namespace("layer_%d" % layer_idx):
+            layer_input_blob = prev_output_blob
+            with flow.scope.namespace("attention"):
+                with flow.scope.namespace("self"):
+                    attention_output_blob = _AttentionLayer(
+                        from_blob=layer_input_blob,
+                        to_blob=layer_input_blob,
+                        attention_mask_blob=attention_mask_blob,
+                        num_attention_heads=num_attention_heads,
+                        size_per_head=attention_head_size,
+                        attention_probs_dropout_prob=attention_probs_dropout_prob,
+                        initializer_range=initializer_range,
+                        do_return_2d_tensor=True,
+                        from_seq_length=seq_length,
+                        to_seq_length=seq_length,
+                    )
+                with flow.scope.namespace("output"):
+                    attention_output_blob = _FullyConnected(
+                        attention_output_blob,
+                        input_size=num_attention_heads * attention_head_size,
+                        units=hidden_size,
+                        weight_initializer=CreateInitializer(initializer_range),
+                        name="dense",
+                    )
+                    attention_output_blob = _Dropout(
+                        attention_output_blob, hidden_dropout_prob
+                    )
+                    attention_output_blob = attention_output_blob + layer_input_blob
+                    attention_output_blob = _LayerNorm(
+                        attention_output_blob, hidden_size
+                    )
+            with flow.scope.namespace("intermediate"):
+                if callable(intermediate_act_fn):
+                    act_fn = op_conf_util.kNone
+                else:
+                    act_fn = intermediate_act_fn
+                intermediate_output_blob = _FullyConnected(
+                    attention_output_blob,
+                    input_size=num_attention_heads * attention_head_size,
+                    units=intermediate_size,
+                    activation=act_fn,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                if callable(intermediate_act_fn):
+                    intermediate_output_blob = intermediate_act_fn(
+                        intermediate_output_blob
+                    )
+            with flow.scope.namespace("output"):
+                layer_output_blob = _FullyConnected(
+                    intermediate_output_blob,
+                    input_size=intermediate_size,
+                    units=hidden_size,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
+                layer_output_blob = layer_output_blob + attention_output_blob
+                layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
+                prev_output_blob = layer_output_blob
+                all_layer_output_blobs.append(layer_output_blob)
+    input_shape = (-1, seq_length, hidden_size)
+    if do_return_all_layers:
+        final_output_blobs = []
+        for layer_output_blob in all_layer_output_blobs:
+            final_output_blob = flow.reshape(layer_output_blob, input_shape)
+            final_output_blobs.append(final_output_blob)
+        return final_output_blobs
+    else:
+        final_output_blob = flow.reshape(prev_output_blob, input_shape)
+        return [final_output_blob]
+
+
+def _AttentionLayer(
+    from_blob,
+    to_blob,
+    attention_mask_blob,
+    num_attention_heads=1,
+    size_per_head=512,
+    query_act=op_conf_util.kNone,
+    key_act=op_conf_util.kNone,
+    value_act=op_conf_util.kNone,
+    attention_probs_dropout_prob=0.0,
+    initializer_range=0.02,
+    do_return_2d_tensor=False,
+    batch_size=None,
+    from_seq_length=None,
+    to_seq_length=None,
+):
+    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
+        output_blob = flow.reshape(
+            input_blob, [-1, seq_length, num_attention_heads, width]
+        )
+        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
+        return output_blob
+
+    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
+    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])
+    query_blob = _FullyConnected(
+        from_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=query_act,
+        name="query",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    key_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=key_act,
+        name="key",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    value_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=value_act,
+        name="value",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    query_blob = TransposeForScores(
+        query_blob, num_attention_heads, from_seq_length, size_per_head
+    )
+    key_blob = TransposeForScores(
+        key_blob, num_attention_heads, to_seq_length, size_per_head
+    )
+    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
+    attention_scores_blob = attention_scores_blob * (
+        1.0 / math.sqrt(float(size_per_head))
+    )
+    attention_mask_blob = flow.reshape(
+        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length]
+    )
+    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
+    addr_blob = (attention_mask_blob - 1.0) * 10000.0
+    attention_scores_blob = attention_scores_blob + addr_blob
+    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
+    attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)
+    value_blob = flow.reshape(
+        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head]
+    )
+    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
+    context_blob = flow.matmul(attention_probs_blob, value_blob)
+    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])
+    if do_return_2d_tensor:
+        context_blob = flow.reshape(
+            context_blob, [-1, num_attention_heads * size_per_head]
+        )
+    else:
+        context_blob = flow.reshape(
+            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head]
+        )
+    return context_blob
+
+
+def _FullyConnected(
+    input_blob, input_size, units, activation=None, name=None, weight_initializer=None
+):
+    weight_blob = flow.get_variable(
+        name=name + "-weight",
+        shape=[input_size, units],
+        dtype=input_blob.dtype,
+        initializer=weight_initializer,
+    )
+    bias_blob = flow.get_variable(
+        name=name + "-bias",
+        shape=[units],
+        dtype=input_blob.dtype,
+        initializer=flow.constant_initializer(0.0),
+    )
+    output_blob = flow.matmul(input_blob, weight_blob)
+    output_blob = flow.nn.bias_add(output_blob, bias_blob)
+    return output_blob
+
+
+def _Dropout(input_blob, dropout_prob):
+    if dropout_prob == 0.0:
+        return input_blob
+    return flow.nn.dropout(input_blob, rate=dropout_prob)
+
+
+def _LayerNorm(input_blob, hidden_size):
+    return flow.layers.layer_norm(
+        input_blob, name="LayerNorm", begin_norm_axis=-1, begin_params_axis=-1
+    )
+
+
+def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
+    output = flow.cast(to_mask_blob, dtype=flow.float)
+    output = flow.reshape(output, [-1, 1, to_seq_length])
+    zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
+    output = zeros + output
+    return output
+
+
+def _EmbeddingPostprocessor(
+    input_blob,
+    seq_length,
+    embedding_size,
+    use_token_type=False,
+    token_type_ids_blob=None,
+    token_type_vocab_size=16,
+    token_type_embedding_name="token_type_embeddings",
+    use_position_embeddings=True,
+    position_embedding_name="position_embeddings",
+    initializer_range=0.02,
+    max_position_embeddings=512,
+    dropout_prob=0.1,
+):
+    output = input_blob
+    if use_token_type:
+        assert token_type_ids_blob is not None
+        token_type_table = flow.get_variable(
+            name=token_type_embedding_name,
+            shape=[token_type_vocab_size, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        token_type_embeddings = flow.gather(
+            params=token_type_table, indices=token_type_ids_blob, axis=0
+        )
+        output = output + token_type_embeddings
+    if use_position_embeddings:
+        position_table = flow.get_variable(
+            name=position_embedding_name,
+            shape=[1, max_position_embeddings, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        assert seq_length <= max_position_embeddings
+        if seq_length != max_position_embeddings:
+            position_table = flow.slice(
+                position_table, begin=[None, 0, 0], size=[None, seq_length, -1]
+            )
+        output = output + position_table
+    output = _LayerNorm(output, embedding_size)
+    output = _Dropout(output, dropout_prob)
+    return output
+
+
+def _EmbeddingLookup(
+    input_ids_blob,
+    vocab_size,
+    embedding_size=128,
+    initializer_range=0.02,
+    word_embedding_name="word_embeddings",
+):
+    embedding_table = flow.get_variable(
+        name=word_embedding_name,
+        shape=[vocab_size, embedding_size],
+        dtype=flow.float,
+        initializer=CreateInitializer(initializer_range),
+    )
+    output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
+    return (output, embedding_table)
+
+
+def GetActivation(name):
+    if name == "linear":
+        return None
+    elif name == "relu":
+        return flow.math.relu
+    elif name == "tanh":
+        return flow.math.tanh
+    elif name == "gelu":
+        return flow.math.gelu
+    else:
+        raise Exception("unsupported activation")
diff --git a/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/pretrain.py b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b3cac2d579db44cdf59f1c9a973de569fc966e0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/pretrain.py
@@ -0,0 +1,189 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import bert as bert_util
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def PreTrain(
+    input_ids_blob,
+    input_mask_blob,
+    token_type_ids_blob,
+    masked_lm_positions_blob,
+    masked_lm_ids_blob,
+    masked_lm_weights_blob,
+    next_sentence_label_blob,
+    vocab_size,
+    seq_length=512,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    max_position_embeddings=512,
+    type_vocab_size=16,
+    max_predictions_per_seq=20,
+    initializer_range=0.02,
+):
+    backbone = bert_util.BertBackbone(
+        input_ids_blob=input_ids_blob,
+        input_mask_blob=input_mask_blob,
+        token_type_ids_blob=token_type_ids_blob,
+        vocab_size=vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act=hidden_act,
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        initializer_range=initializer_range,
+    )
+    (lm_loss, _, _) = _AddMaskedLanguageModelLoss(
+        input_blob=backbone.sequence_output(),
+        output_weights_blob=backbone.embedding_table(),
+        positions_blob=masked_lm_positions_blob,
+        label_id_blob=masked_lm_ids_blob,
+        label_weight_blob=masked_lm_weights_blob,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        vocab_size=vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        hidden_act=bert_util.GetActivation(hidden_act),
+        initializer_range=initializer_range,
+    )
+    pooled_output = PooledOutput(
+        backbone.sequence_output(), hidden_size, initializer_range
+    )
+    (ns_loss, _, _) = _AddNextSentenceOutput(
+        input_blob=pooled_output,
+        label_blob=next_sentence_label_blob,
+        hidden_size=hidden_size,
+        initializer_range=initializer_range,
+    )
+    with flow.scope.namespace("cls-loss"):
+        total_loss = lm_loss + ns_loss
+    return (total_loss, lm_loss, ns_loss)
+
+
+def PooledOutput(sequence_output, hidden_size, initializer_range):
+    with flow.scope.namespace("bert-pooler"):
+        first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
+        first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
+        pooled_output = bert_util._FullyConnected(
+            first_token_tensor,
+            input_size=hidden_size,
+            units=hidden_size,
+            weight_initializer=bert_util.CreateInitializer(initializer_range),
+            name="dense",
+        )
+        pooled_output = flow.math.tanh(pooled_output)
+    return pooled_output
+
+
+def _AddMaskedLanguageModelLoss(
+    input_blob,
+    output_weights_blob,
+    positions_blob,
+    label_id_blob,
+    label_weight_blob,
+    seq_length,
+    hidden_size,
+    vocab_size,
+    max_predictions_per_seq,
+    hidden_act,
+    initializer_range,
+):
+    with flow.scope.namespace("other"):
+        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
+        ones = sum_label_weight_blob * 0.0 + 1.0
+        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
+        batch_size = flow.math.reduce_sum(ones)
+        sum_label_weight_blob = sum_label_weight_blob / batch_size
+    with flow.scope.namespace("cls-predictions"):
+        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
+        with flow.scope.namespace("transform"):
+            if callable(hidden_act):
+                act_fn = op_conf_util.kNone
+            else:
+                act_fn = hidden_act
+            input_blob = bert_util._FullyConnected(
+                input_blob,
+                input_size=hidden_size,
+                units=hidden_size,
+                activation=act_fn,
+                weight_initializer=bert_util.CreateInitializer(initializer_range),
+                name="dense",
+            )
+            if callable(hidden_act):
+                input_blob = hidden_act(input_blob)
+                input_blob = bert_util._LayerNorm(input_blob, hidden_size)
+        output_bias = flow.get_variable(
+            name="output_bias",
+            shape=[vocab_size],
+            dtype=input_blob.dtype,
+            initializer=flow.constant_initializer(1.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
+        label_id_blob = flow.reshape(label_id_blob, [-1])
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_id_blob
+        )
+        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
+        numerator = pre_example_loss * label_weight_blob
+        with flow.scope.namespace("loss"):
+            numerator = flow.math.reduce_sum(numerator, axis=[-1])
+            denominator = sum_label_weight_blob + 1e-05
+            loss = numerator / denominator
+        return (loss, pre_example_loss, logit_blob)
+
+
+def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
+    output = flow.gather(
+        params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
+    )
+    output = flow.reshape(output, [-1, hidden_size])
+    return output
+
+
+def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
+    with flow.scope.namespace("cls-seq_relationship"):
+        output_weight_blob = flow.get_variable(
+            name="output_weights",
+            shape=[2, hidden_size],
+            dtype=input_blob.dtype,
+            initializer=bert_util.CreateInitializer(initializer_range),
+        )
+        output_bias_blob = flow.get_variable(
+            name="output_bias",
+            shape=[2],
+            dtype=input_blob.dtype,
+            initializer=flow.constant_initializer(0.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_blob
+        )
+        loss = pre_example_loss
+        return (loss, pre_example_loss, logit_blob)
diff --git a/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/run_pretraining.py b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/run_pretraining.py
new file mode 100644
index 0000000000000000000000000000000000000000..31c5736ed3d2db868e568a9a10aa9d2989368058
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/bert_benchmark/run_pretraining.py
@@ -0,0 +1,311 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+import random
+import time
+from collections import OrderedDict
+from datetime import datetime
+
+import benchmark_util
+from pretrain import PreTrain
+
+from oneflow.compatible import single_client as flow
+
+parser = argparse.ArgumentParser(description="flags for bert")
+parser.add_argument("--gpu_num_per_node", type=int, default=1)
+parser.add_argument("--node_num", type=int, default=1)
+parser.add_argument("--node_list", type=str, default=None)
+parser.add_argument("--learning_rate", type=float, default=0.0001, help="Learning rate")
+parser.add_argument(
+    "--weight_decay_rate", type=float, default=0.01, help="weight decay rate"
+)
+parser.add_argument("--batch_size_per_device", type=int, default=24)
+parser.add_argument("--iter_num", type=int, default=10, help="total iterations to run")
+parser.add_argument(
+    "--skip_iter_num",
+    type=int,
+    default=10,
+    help="number of skipping iterations for benchmark purpose.",
+)
+parser.add_argument(
+    "--log_every_n_iter", type=int, default=1, help="print loss every n iteration"
+)
+parser.add_argument("--data_dir", type=str, default=None)
+parser.add_argument(
+    "--data_part_num", type=int, default=32, help="data part number in dataset"
+)
+parser.add_argument(
+    "--enable_auto_mixed_precision",
+    default=False,
+    type=lambda x: str(x).lower() == "true",
+)
+parser.add_argument(
+    "--loss_print_every_n_iter",
+    type=int,
+    default=1,
+    required=False,
+    help="print loss every n iteration",
+)
+parser.add_argument(
+    "--model_save_every_n_iter",
+    type=int,
+    default=200,
+    required=False,
+    help="save model every n iteration",
+)
+parser.add_argument(
+    "--model_save_dir",
+    type=str,
+    default="./output/model_save-{}".format(
+        str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+    ),
+    required=False,
+    help="model save directory",
+)
+parser.add_argument(
+    "--save_last_snapshot",
+    type=bool,
+    default=False,
+    required=False,
+    help="save model snapshot for last iteration",
+)
+parser.add_argument(
+    "--model_load_dir",
+    type=str,
+    default=None,
+    required=False,
+    help="model load directory",
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default="./output",
+    required=False,
+    help="log info save directory",
+)
+parser.add_argument("--seq_length", type=int, default=512)
+parser.add_argument("--max_predictions_per_seq", type=int, default=80)
+parser.add_argument("--num_hidden_layers", type=int, default=24)
+parser.add_argument("--num_attention_heads", type=int, default=16)
+parser.add_argument("--max_position_embeddings", type=int, default=512)
+parser.add_argument("--type_vocab_size", type=int, default=2)
+parser.add_argument("--vocab_size", type=int, default=30522)
+parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
+parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
+parser.add_argument("--hidden_size_per_head", type=int, default=64)
+parser.add_argument("--warmup_batches", type=int, default=1000)
+parser.add_argument("--lr_decay_num", type=int, default=100000)
+parser.add_argument(
+    "--lr_decay_num_same_as_iter_num",
+    default=False,
+    type=lambda x: str(x).lower() == "true",
+)
+args = parser.parse_args()
+
+
+def _blob_conf(name, shape, dtype=flow.int32):
+    return flow.data.BlobConf(
+        name=name, shape=shape, dtype=dtype, codec=flow.data.RawCodec()
+    )
+
+
+def BertDecoder(
+    data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq
+):
+    config_ordered_dict = OrderedDict()
+    config_ordered_dict["input_ids"] = seq_length
+    config_ordered_dict["next_sentence_labels"] = 1
+    config_ordered_dict["input_mask"] = seq_length
+    config_ordered_dict["segment_ids"] = seq_length
+    config_ordered_dict["masked_lm_ids"] = max_predictions_per_seq
+    config_ordered_dict["masked_lm_positions"] = max_predictions_per_seq
+    config_ordered_dict["masked_lm_weights"] = max_predictions_per_seq
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=batch_size, data_part_num=data_part_num, name="decode"
+    )
+    ret = {}
+    for (k, v) in config_ordered_dict.items():
+        ret[k] = flow.data.ofrecord_raw_decoder(
+            ofrecord,
+            k,
+            shape=(v,),
+            dtype=flow.float if k == "masked_lm_weights" else flow.int32,
+        )
+    return ret
+
+
+def BuildPreTrainNet(
+    batch_size,
+    data_part_num,
+    seq_length=128,
+    max_position_embeddings=512,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    vocab_size=30522,
+    type_vocab_size=2,
+    max_predictions_per_seq=20,
+):
+    hidden_size = 64 * num_attention_heads
+    intermediate_size = hidden_size * 4
+    decoders = BertDecoder(
+        args.data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq
+    )
+    input_ids = decoders["input_ids"]
+    next_sentence_labels = decoders["next_sentence_labels"]
+    input_mask = decoders["input_mask"]
+    token_type_ids = decoders["segment_ids"]
+    masked_lm_ids = decoders["masked_lm_ids"]
+    masked_lm_positions = decoders["masked_lm_positions"]
+    masked_lm_weights = decoders["masked_lm_weights"]
+    return PreTrain(
+        input_ids,
+        input_mask,
+        token_type_ids,
+        masked_lm_positions,
+        masked_lm_ids,
+        masked_lm_weights,
+        next_sentence_labels,
+        vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act="gelu",
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        initializer_range=0.02,
+    )
+
+
+_BERT_MODEL_UPDATE_CONF = dict(
+    learning_rate_decay=dict(
+        polynomial_conf=dict(
+            decay_batches=args.iter_num
+            if args.lr_decay_num_same_as_iter_num
+            else args.lr_decay_num,
+            end_learning_rate=0.0,
+        )
+    ),
+    warmup_conf=dict(
+        linear_conf=dict(warmup_batches=args.warmup_batches, start_multiplier=0)
+    ),
+    clip_conf=dict(clip_by_global_norm=dict(clip_norm=1.0)),
+    adam_conf=dict(epsilon=1e-06),
+    weight_decay_conf=dict(
+        weight_decay_rate=args.weight_decay_rate,
+        excludes=dict(pattern=["bias", "LayerNorm", "layer_norm"]),
+    ),
+)
+func_config = flow.FunctionConfig()
+func_config.default_distribute_strategy(flow.scope.consistent_view())
+func_config.train.primary_lr(args.learning_rate)
+func_config.default_data_type(flow.float)
+func_config.train.model_update_conf(_BERT_MODEL_UPDATE_CONF)
+func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+flow.config.gpu_device_num(args.gpu_num_per_node)
+
+
+@flow.global_function(func_config)
+def PretrainJob():
+    total_device_num = args.node_num * args.gpu_num_per_node
+    batch_size = total_device_num * args.batch_size_per_device
+    (total_loss, mlm_loss, nsp_loss) = BuildPreTrainNet(
+        batch_size,
+        args.data_part_num,
+        seq_length=args.seq_length,
+        max_position_embeddings=args.max_position_embeddings,
+        num_hidden_layers=args.num_hidden_layers,
+        num_attention_heads=args.num_attention_heads,
+        hidden_dropout_prob=args.hidden_dropout_prob,
+        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
+        vocab_size=args.vocab_size,
+        type_vocab_size=args.type_vocab_size,
+        max_predictions_per_seq=args.max_predictions_per_seq,
+    )
+    flow.losses.add_loss(total_loss)
+    return (total_loss, mlm_loss, nsp_loss)
+
+
+def main():
+    print("=".ljust(66, "="))
+    print(
+        "Running bert: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, args.node_num
+        )
+    )
+    print("=".ljust(66, "="))
+    for arg in vars(args):
+        print("{} = {}".format(arg, getattr(args, arg)))
+    print("-".ljust(66, "-"))
+    print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
+    flow.env.log_dir(args.log_dir)
+    if args.node_num > 1:
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+    if os.getenv("ONEFLOW_DRY_RUN"):
+        flow.env.ctrl_port(9788)
+    check_point = flow.train.CheckPoint()
+    if args.model_load_dir:
+        assert os.path.isdir(args.model_load_dir)
+        check_point.load(args.model_load_dir)
+        print("Restoring model from {}.".format(args.model_load_dir))
+    else:
+        check_point.init()
+        print("Init model on demand")
+    total_batch_size = (
+        args.node_num * args.gpu_num_per_node * args.batch_size_per_device
+    )
+    speedometer = benchmark_util.BERTSpeedometer()
+    start_time = time.time()
+    for step in range(args.skip_iter_num + args.iter_num):
+        cb = speedometer.speedometer_cb(
+            step,
+            start_time,
+            total_batch_size,
+            args.skip_iter_num,
+            args.iter_num,
+            args.loss_print_every_n_iter,
+        )
+        PretrainJob().async_get(cb)
+        if (step + 1) % args.model_save_every_n_iter == 0:
+            if not os.path.exists(args.model_save_dir):
+                os.makedirs(args.model_save_dir)
+            snapshot_save_path = os.path.join(
+                args.model_save_dir, "snapshot_%d" % (step + 1)
+            )
+            print("Saving model to {}.".format(snapshot_save_path))
+            check_point.save(snapshot_save_path)
+    if args.save_last_snapshot:
+        snapshot_save_path = os.path.join(args.model_save_dir, "last_snapshot")
+        if not os.path.exists(snapshot_save_path):
+            os.makedirs(snapshot_save_path)
+        print("Saving model to {}.".format(snapshot_save_path))
+        check_point.save(snapshot_save_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/__init__.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/alexnet.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e9a1a45214c35933339bcdb08832973dac9863c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/alexnet.py
@@ -0,0 +1,195 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+
+from oneflow.compatible import single_client as flow
+
+DATA_DIR = "/dataset/imagenet_1k/oneflow/30/train"
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=None, required=False)
+parser.add_argument("-e", "--eval_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument("-load", "--model_load_dir", type=str, default="", required=False)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default="./checkpoints", required=False
+)
+args = parser.parse_args()
+
+
+def _data_load_layer(data_dir):
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=12, data_part_num=8, name="decode"
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=227, resize_y=227, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation="Relu",
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=None,
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == "Relu":
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def alexnet(images, labels):
+    conv1 = _conv2d_layer(
+        "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv2d_layer("conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv2d_layer("conv3", pool2, filters=384)
+    conv4 = _conv2d_layer("conv4", conv3, filters=384)
+    conv5 = _conv2d_layer("conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=True,
+        name="fc1",
+    )
+    dropout1 = flow.nn.dropout(fc1, rate=0.5)
+    fc2 = flow.layers.dense(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=True,
+        name="fc2",
+    )
+    dropout2 = flow.nn.dropout(fc2, rate=0.5)
+    fc3 = flow.layers.dense(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=True,
+        name="fc3",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc3, name="softmax_loss"
+    )
+    return loss
+
+
+@flow.global_function
+def alexnet_train_job():
+    flow.config.train.primary_lr(1e-05)
+    flow.config.train.model_update_conf(dict(naive_conf={}))
+    (labels, images) = _data_load_layer(args.train_dir)
+    loss = alexnet(images, labels)
+    flow.losses.add_loss(loss)
+    return loss
+
+
+@flow.global_function
+def alexnet_eval_job():
+    (labels, images) = _data_load_layer(args.eval_dir)
+    loss = alexnet(images, labels)
+    return loss
+
+
+def main():
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    flow.config.ctrl_port(9788)
+    flow.config.default_data_type(flow.float)
+    if args.multinode:
+        flow.config.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.config.machine(nodes)
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    for i in range(args.iter_num):
+        fmt_str = "{:>12}  {:>12}  {:>12.10f}"
+        train_loss = alexnet_train_job().get().mean()
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 10 == 0:
+            eval_loss = alexnet_eval_job().get().mean()
+            print(fmt_str.format(i, "eval loss:", eval_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(args.model_save_dir + str(i))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/alexnet_model.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/alexnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a4a060495917e3c797769d6d8d04b77dd686d85
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/alexnet_model.py
@@ -0,0 +1,66 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from model_util import conv2d_layer
+
+from oneflow.compatible import single_client as flow
+
+
+def alexnet(images, trainable=True):
+    conv1 = conv2d_layer(
+        "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = conv2d_layer("conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = conv2d_layer("conv3", pool2, filters=384)
+    conv4 = conv2d_layer("conv4", conv3, filters=384)
+    conv5 = conv2d_layer("conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc1",
+    )
+    dropout1 = flow.nn.dropout(fc1, rate=0.5)
+    fc2 = flow.layers.dense(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc2",
+    )
+    dropout2 = flow.nn.dropout(fc2, rate=0.5)
+    fc3 = flow.layers.dense(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=flow.random_uniform_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc3",
+    )
+    return fc3
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/benchmark_util.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..38b867b8825b691d5d3ba1e4d8920f5ae046137e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/benchmark_util.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import time
+
+import numpy as np
+
+
+class StopWatch:
+    def __init__(self):
+        pass
+
+    def start(self):
+        self.start_time = time.time()
+        self.last_split = self.start_time
+
+    def set_start(self, val):
+        self.start_time = val
+        self.last_split = self.start_time
+
+    def split(self):
+        now = time.time()
+        duration = now - self.last_split
+        self.last_split = now
+        return duration
+
+    def stop(self):
+        self.stop_time = time.time()
+
+    def duration(self):
+        return self.stop_time - self.start_time
+
+
+class CNNSpeedometer:
+    def __init__(self):
+        self.watch = StopWatch()
+        self.throughoutput_list = []
+
+    def speedometer_cb(
+        self,
+        step,
+        start_time,
+        total_batch_size,
+        skip_iter_num,
+        iter_num,
+        loss_print_every_n_iter,
+    ):
+        def callback(train_loss):
+            assert skip_iter_num >= 0
+            if skip_iter_num == 0 and step == 0:
+                self.watch.set_start(start_time)
+                print("Start trainning without any skipping iteration.")
+            if step < skip_iter_num:
+                if step == 0:
+                    print(
+                        "Skipping {} iterations for benchmark purpose.".format(
+                            skip_iter_num
+                        )
+                    )
+                if step + 1 == skip_iter_num:
+                    self.watch.start()
+                    print("Start trainning.")
+            else:
+                train_step = step - skip_iter_num
+                if (train_step + 1) % loss_print_every_n_iter == 0:
+                    loss = train_loss.mean()
+                    avg_elapse_time_per_iter = (
+                        self.watch.split() / loss_print_every_n_iter
+                    )
+                    samples_per_sec = total_batch_size / avg_elapse_time_per_iter
+                    print(
+                        "iter {}, loss: {:.3f}, speed: {:.3f}(sec/batch), {:.3f}(images/sec)".format(
+                            train_step, loss, avg_elapse_time_per_iter, samples_per_sec
+                        )
+                    )
+                    self.throughoutput_list.append(samples_per_sec)
+                if train_step + 1 == iter_num:
+                    self.watch.stop()
+                    totoal_duration = self.watch.duration()
+                    avg_samples_per_sec = total_batch_size * iter_num / totoal_duration
+                    print("-".ljust(66, "-"))
+                    print(
+                        "average speed: {:.3f}(images/sec), new_cal_method: {:.3f}(images/sec)".format(
+                            avg_samples_per_sec, np.mean(self.throughoutput_list)
+                        )
+                    )
+                    print("-".ljust(66, "-"))
+
+        return callback
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/data_loader.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..8488d0a9ff877ea4b51a66dcb3b680274ffc6e81
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/data_loader.py
@@ -0,0 +1,61 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+
+def load_imagenet(
+    data_dir, image_size, batch_size, data_part_num, gpu_image_decoder=False
+):
+    rgb_mean = [123.68, 116.78, 103.94]
+    rgb_std = [255.0, 255.0, 255.0]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=batch_size, data_part_num=data_part_num, name="decode"
+    )
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    if gpu_image_decoder:
+        encoded = flow.data.OFRecordBytesDecoder(ofrecord, "encoded")
+        rsz = flow.data.ImageDecoderRandomCropResize(
+            encoded, target_width=image_size, target_height=image_size, num_workers=3
+        )
+    else:
+        image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+        rsz = flow.image.resize(
+            image, resize_x=image_size, resize_y=image_size, color_space="RGB"
+        )
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        std=rgb_std,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def load_synthetic(image_size, batch_size):
+    label = flow.data.decode_random(
+        shape=(),
+        dtype=flow.int32,
+        batch_size=batch_size,
+        initializer=flow.zeros_initializer(flow.int32),
+    )
+    image = flow.data.decode_random(
+        shape=(3, image_size, image_size), dtype=flow.float, batch_size=batch_size
+    )
+    return (label, image)
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/inceptionv3_model.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/inceptionv3_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..776e8eb128bdddd9b265e5a3ae4eb18330556617
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/inceptionv3_model.py
@@ -0,0 +1,477 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kSigmoid,
+    use_bias=True,
+    trainable=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+    else:
+        kernel_size = tuple(kernel_size)
+    weight_shape = (filters, input.shape[1]) + kernel_size
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        elif activation == op_conf_util.kSigmoid:
+            output = flow.math.sigmoid(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def InceptionA(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch5x5"):
+            branch5x5_1 = _conv2d_layer(
+                "conv0", in_blob, filters=48, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch5x5_2 = _conv2d_layer(
+                "conv1",
+                branch5x5_1,
+                filters=64,
+                kernel_size=5,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=32 if index == 0 else 64,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+        inceptionA_bn = []
+        inceptionA_bn.append(branch1x1)
+        inceptionA_bn.append(branch5x5_2)
+        inceptionA_bn.append(branch3x3dbl_3)
+        inceptionA_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionA_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionB(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=3, strides=2, padding="VALID"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool0",
+            )
+        inceptionB_bn = []
+        inceptionB_bn.append(branch3x3)
+        inceptionB_bn.append(branch3x3dbl_3)
+        inceptionB_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionB_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionC(in_blob, index, filters):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch7x7"):
+            branch7x7_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_2 = _conv2d_layer(
+                "conv1",
+                branch7x7_1,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_3 = _conv2d_layer(
+                "conv2",
+                branch7x7_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch7x7dbl"):
+            branch7x7dbl_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_2 = _conv2d_layer(
+                "conv1",
+                branch7x7dbl_1,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_3 = _conv2d_layer(
+                "conv2",
+                branch7x7dbl_2,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_4 = _conv2d_layer(
+                "conv3",
+                branch7x7dbl_3,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_5 = _conv2d_layer(
+                "conv4",
+                branch7x7dbl_4,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionC_bn = []
+        inceptionC_bn.append(branch1x1)
+        inceptionC_bn.append(branch7x7_3)
+        inceptionC_bn.append(branch7x7dbl_5)
+        inceptionC_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionC_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionD(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=320,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch7x7x3"):
+            branch7x7x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch7x7x3_2 = _conv2d_layer(
+                "conv1",
+                branch7x7x3_1,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_3 = _conv2d_layer(
+                "conv2",
+                branch7x7x3_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_4 = _conv2d_layer(
+                "conv3",
+                branch7x7x3_3,
+                filters=192,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool",
+            )
+        inceptionD_bn = []
+        inceptionD_bn.append(branch3x3_2)
+        inceptionD_bn.append(branch7x7x3_4)
+        inceptionD_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionD_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionE(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=320, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3_3 = _conv2d_layer(
+                "conv2",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+            inceptionE_1_bn = []
+            inceptionE_1_bn.append(branch3x3_2)
+            inceptionE_1_bn.append(branch3x3_3)
+            concat_branch3x3 = flow.concat(
+                values=inceptionE_1_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=448, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=384,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_4 = _conv2d_layer(
+                "conv3",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=1,
+                padding="SAME",
+            )
+            inceptionE_2_bn = []
+            inceptionE_2_bn.append(branch3x3dbl_3)
+            inceptionE_2_bn.append(branch3x3dbl_4)
+            concat_branch3x3dbl = flow.concat(
+                values=inceptionE_2_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionE_total_bn = []
+        inceptionE_total_bn.append(branch1x1)
+        inceptionE_total_bn.append(concat_branch3x3)
+        inceptionE_total_bn.append(concat_branch3x3dbl)
+        inceptionE_total_bn.append(branch_pool_2)
+        concat_total = flow.concat(values=inceptionE_total_bn, axis=1, name="concat")
+    return concat_total
+
+
+def inceptionv3(images, labels, trainable=True):
+    conv0 = _conv2d_layer(
+        "conv0", images, filters=32, kernel_size=3, strides=2, padding="VALID"
+    )
+    conv1 = _conv2d_layer(
+        "conv1", conv0, filters=32, kernel_size=3, strides=1, padding="VALID"
+    )
+    conv2 = _conv2d_layer(
+        "conv2", conv1, filters=64, kernel_size=3, strides=1, padding="SAME"
+    )
+    pool1 = flow.nn.max_pool2d(
+        conv2, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    conv3 = _conv2d_layer(
+        "conv3", pool1, filters=80, kernel_size=1, strides=1, padding="VALID"
+    )
+    conv4 = _conv2d_layer(
+        "conv4", conv3, filters=192, kernel_size=3, strides=1, padding="VALID"
+    )
+    pool2 = flow.nn.max_pool2d(
+        conv4, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool2"
+    )
+    mixed_0 = InceptionA(pool2, 0)
+    mixed_1 = InceptionA(mixed_0, 1)
+    mixed_2 = InceptionA(mixed_1, 2)
+    mixed_3 = InceptionB(mixed_2, 3)
+    mixed_4 = InceptionC(mixed_3, 4, 128)
+    mixed_5 = InceptionC(mixed_4, 5, 160)
+    mixed_6 = InceptionC(mixed_5, 6, 160)
+    mixed_7 = InceptionC(mixed_6, 7, 192)
+    mixed_8 = InceptionD(mixed_7, 8)
+    mixed_9 = InceptionE(mixed_8, 9)
+    mixed_10 = InceptionE(mixed_9, 10)
+    pool3 = flow.nn.avg_pool2d(
+        mixed_10, ksize=8, strides=1, padding="VALID", data_format="NCHW", name="pool3"
+    )
+    with flow.scope.namespace("logits"):
+        pool3 = flow.reshape(pool3, [pool3.shape[0], -1])
+        weight = flow.get_variable(
+            "fc1-weight",
+            shape=(pool3.shape[1], 1001),
+            dtype=flow.float,
+            initializer=flow.truncated_normal(0.816496580927726),
+            model_name="weight",
+        )
+        bias = flow.get_variable(
+            "fc1-bias",
+            shape=(1001,),
+            dtype=flow.float,
+            initializer=flow.constant_initializer(),
+            model_name="bias",
+        )
+        fc1 = flow.matmul(pool3, weight)
+        fc1 = flow.nn.bias_add(fc1, bias)
+    return fc1
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/model_util.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/model_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..47b76e2d9f5e61caa3ffc1bca4bed1d15a0da42f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/model_util.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+
+def conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation="Relu",
+    use_bias=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == "Relu":
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/of_cnn_benchmarks.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/of_cnn_benchmarks.py
new file mode 100644
index 0000000000000000000000000000000000000000..025e403d6659f7b432192d8c05e767c26593975d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/of_cnn_benchmarks.py
@@ -0,0 +1,263 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+import time
+from datetime import datetime
+
+import alexnet_model
+import benchmark_util
+import data_loader
+import resnet_model
+import vgg_model
+
+from oneflow.compatible import single_client as flow
+
+parser = argparse.ArgumentParser(description="flags for cnn benchmark")
+parser.add_argument("--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("--node_num", type=int, default=1)
+parser.add_argument(
+    "--node_list",
+    type=str,
+    default=None,
+    required=False,
+    help="nodes' IP address, split by comma",
+)
+parser.add_argument(
+    "--model", type=str, default="vgg16", required=False, help="vgg16 or resnet50"
+)
+parser.add_argument("--batch_size_per_device", type=int, default=8, required=False)
+parser.add_argument("--learning_rate", type=float, default=0.0001, required=False)
+parser.add_argument(
+    "--optimizer", type=str, default="sgd", required=False, help="sgd, adam, momentum"
+)
+parser.add_argument(
+    "--weight_l2",
+    type=float,
+    default=None,
+    required=False,
+    help="weight decay parameter",
+)
+parser.add_argument(
+    "--iter_num", type=int, default=10, required=False, help="total iterations to run"
+)
+parser.add_argument(
+    "--skip_iter_num",
+    type=int,
+    default=0,
+    required=False,
+    help="number of skipping iterations for benchmark purpose.",
+)
+parser.add_argument(
+    "--data_dir", type=str, default=None, required=False, help="dataset directory"
+)
+parser.add_argument(
+    "--data_part_num",
+    type=int,
+    default=32,
+    required=False,
+    help="data part number in dataset",
+)
+parser.add_argument(
+    "--gpu_image_decoder",
+    type=bool,
+    default=False,
+    required=False,
+    help="Whether to use use ImageDecoderRandomCropResize.",
+)
+parser.add_argument(
+    "--image_size", type=int, default=228, required=False, help="image size"
+)
+parser.add_argument(
+    "--loss_print_every_n_iter",
+    type=int,
+    default=1,
+    required=False,
+    help="print loss every n iteration",
+)
+parser.add_argument(
+    "--model_save_every_n_iter",
+    type=int,
+    default=200,
+    required=False,
+    help="save model every n iteration",
+)
+parser.add_argument(
+    "--model_save_dir",
+    type=str,
+    default="./output/model_save-{}".format(
+        str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+    ),
+    required=False,
+    help="model save directory",
+)
+parser.add_argument(
+    "--save_last_snapshot",
+    type=bool,
+    default=False,
+    required=False,
+    help="save model snapshot for last iteration",
+)
+parser.add_argument(
+    "--model_load_dir",
+    type=str,
+    default=None,
+    required=False,
+    help="model load directory",
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default="./output",
+    required=False,
+    help="log info save directory",
+)
+parser.add_argument(
+    "--enable_auto_mixed_precision",
+    type=bool,
+    default=False,
+    required=False,
+    help="automatically change the float net into mixed precision net",
+)
+args = parser.parse_args()
+model_dict = {
+    "resnet50": resnet_model.resnet50,
+    "vgg16": vgg_model.vgg16,
+    "alexnet": alexnet_model.alexnet,
+}
+func_config = flow.FunctionConfig()
+func_config.default_distribute_strategy(flow.scope.consistent_view())
+func_config.default_data_type(flow.float)
+func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+if args.weight_l2:
+    func_config.train.weight_l2(args.weight_l2)
+flow.config.gpu_device_num(args.gpu_num_per_node)
+
+
+def set_up_optimizer(loss, args):
+    loss_scale_policy = None
+    if args.enable_auto_mixed_precision:
+        loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(
+            increment_period=2000
+        )
+    if args.optimizer == "sgd":
+        print("Optimizer:  SGD")
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate]),
+            loss_scale_policy=loss_scale_policy,
+        ).minimize(loss)
+    elif args.optimizer == "momentum":
+        print("Optimizer:  Momentum")
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate]),
+            momentum=0.9,
+            loss_scale_policy=loss_scale_policy,
+        ).minimize(loss)
+    elif args.optimizer == "adam":
+        print("Optimizer:  Adam")
+        flow.optimizer.Adam(
+            flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate]),
+            beta1=0.9,
+            loss_scale_policy=loss_scale_policy,
+        ).minimize(loss)
+
+
+@flow.global_function(func_config)
+def TrainNet():
+    total_device_num = args.node_num * args.gpu_num_per_node
+    batch_size = total_device_num * args.batch_size_per_device
+    if args.data_dir:
+        assert os.path.exists(args.data_dir)
+        print("Loading data from {}".format(args.data_dir))
+        (labels, images) = data_loader.load_imagenet(
+            args.data_dir,
+            args.image_size,
+            batch_size,
+            args.data_part_num,
+            args.gpu_image_decoder,
+        )
+    else:
+        print("Loading synthetic data.")
+        (labels, images) = data_loader.load_synthetic(args.image_size, batch_size)
+    logits = model_dict[args.model](images)
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, logits, name="softmax_loss"
+    )
+    set_up_optimizer(loss, args)
+    return loss
+
+
+def main():
+    print("=".ljust(66, "="))
+    print(
+        "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.model, args.gpu_num_per_node, args.node_num
+        )
+    )
+    print("=".ljust(66, "="))
+    for arg in vars(args):
+        print("{} = {}".format(arg, getattr(args, arg)))
+    print("-".ljust(66, "-"))
+    print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
+    flow.env.log_dir(args.log_dir)
+    if args.node_num > 1:
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+    check_point = flow.train.CheckPoint()
+    if args.model_load_dir:
+        assert os.path.isdir(args.model_load_dir)
+        print("Restoring model from {}.".format(args.model_load_dir))
+        check_point.load(args.model_load_dir)
+    else:
+        print("Init model on demand.")
+        check_point.init()
+    total_batch_size = (
+        args.node_num * args.gpu_num_per_node * args.batch_size_per_device
+    )
+    speedometer = benchmark_util.CNNSpeedometer()
+    start_time = time.time()
+    for step in range(args.skip_iter_num + args.iter_num):
+        cb = speedometer.speedometer_cb(
+            step,
+            start_time,
+            total_batch_size,
+            args.skip_iter_num,
+            args.iter_num,
+            args.loss_print_every_n_iter,
+        )
+        TrainNet().async_get(cb)
+        if (step + 1) % args.model_save_every_n_iter == 0:
+            if not os.path.exists(args.model_save_dir):
+                os.makedirs(args.model_save_dir)
+            snapshot_save_path = os.path.join(
+                args.model_save_dir, "snapshot_%d" % (step + 1)
+            )
+            print("Saving model to {}.".format(snapshot_save_path))
+            check_point.save(snapshot_save_path)
+    if args.save_last_snapshot:
+        snapshot_save_path = os.path.join(args.model_save_dir, "last_snapshot")
+        if not os.path.exists(snapshot_save_path):
+            os.makedirs(snapshot_save_path)
+        print("Saving model to {}.".format(snapshot_save_path))
+        check_point.save(snapshot_save_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/of_cnn_infer_benchmarks.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/of_cnn_infer_benchmarks.py
new file mode 100644
index 0000000000000000000000000000000000000000..47499b81193e0e49494f77b0a90ec5cfd771c195
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/of_cnn_infer_benchmarks.py
@@ -0,0 +1,210 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+import time
+from datetime import datetime
+
+import alexnet_model
+import data_loader
+import inceptionv3_model
+import resnet_model
+import vgg_model
+
+from oneflow.compatible import single_client as flow
+
+parser = argparse.ArgumentParser(description="flags for cnn benchmark")
+parser.add_argument("--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("--node_num", type=int, default=1)
+parser.add_argument(
+    "--node_list",
+    type=str,
+    default=None,
+    required=False,
+    help="nodes' IP address, split by comma",
+)
+parser.add_argument(
+    "--model", type=str, default="vgg16", required=False, help="vgg16 or resnet50"
+)
+parser.add_argument("--batch_size_per_device", type=int, default=8, required=False)
+parser.add_argument(
+    "--iter_num", type=int, default=10, required=False, help="total iterations to run"
+)
+parser.add_argument(
+    "--warmup_iter_num",
+    type=int,
+    default=0,
+    required=False,
+    help="total iterations to run",
+)
+parser.add_argument(
+    "--data_dir", type=str, default=None, required=False, help="dataset directory"
+)
+parser.add_argument(
+    "--data_part_num",
+    type=int,
+    default=32,
+    required=False,
+    help="data part number in dataset",
+)
+parser.add_argument(
+    "--image_size", type=int, default=228, required=False, help="image size"
+)
+parser.add_argument(
+    "--use_tensorrt",
+    dest="use_tensorrt",
+    action="store_true",
+    default=False,
+    required=False,
+    help="inference with tensorrt",
+)
+parser.add_argument(
+    "--use_xla_jit",
+    dest="use_xla_jit",
+    action="store_true",
+    default=False,
+    required=False,
+    help="inference with xla jit",
+)
+parser.add_argument(
+    "--precision",
+    type=str,
+    default="float32",
+    required=False,
+    help="inference with low precision",
+)
+parser.add_argument(
+    "--print_every_n_iter",
+    type=int,
+    default=1,
+    required=False,
+    help="print log every n iterations",
+)
+parser.add_argument(
+    "--model_load_dir",
+    type=str,
+    default=None,
+    required=False,
+    help="model load directory",
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default="./output",
+    required=False,
+    help="log info save directory",
+)
+args = parser.parse_args()
+model_dict = {
+    "resnet50": resnet_model.resnet50,
+    "inceptionv3": inceptionv3_model.inceptionv3,
+    "vgg16": vgg_model.vgg16,
+    "alexnet": alexnet_model.alexnet,
+}
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+flow.config.gpu_device_num(args.gpu_num_per_node)
+if args.use_tensorrt:
+    func_config.use_tensorrt()
+if args.use_xla_jit:
+    func_config.use_xla_jit()
+if args.precision == "float16":
+    if not args.use_tensorrt:
+        func_config.enable_auto_mixed_precision()
+    else:
+        func_config.tensorrt.use_fp16()
+
+
+@flow.global_function(func_config)
+def InferenceNet():
+    total_device_num = args.node_num * args.gpu_num_per_node
+    batch_size = total_device_num * args.batch_size_per_device
+    if args.data_dir:
+        assert os.path.exists(args.data_dir)
+        print("Loading data from {}".format(args.data_dir))
+        (labels, images) = data_loader.load_imagenet(
+            args.data_dir, args.image_size, batch_size, args.data_part_num
+        )
+    else:
+        print("Loading synthetic data.")
+        (labels, images) = data_loader.load_synthetic(args.image_size, batch_size)
+    logits = model_dict[args.model](images)
+    softmax = flow.nn.softmax(logits)
+    return softmax
+
+
+def main():
+    print("=".ljust(66, "="))
+    print(
+        "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.model, args.gpu_num_per_node, args.node_num
+        )
+    )
+    print("=".ljust(66, "="))
+    for arg in vars(args):
+        print("{} = {}".format(arg, getattr(args, arg)))
+    print("-".ljust(66, "-"))
+    print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
+    flow.env.log_dir(args.log_dir)
+    if args.node_num > 1:
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+    check_point = flow.train.CheckPoint()
+    if args.model_load_dir:
+        assert os.path.isdir(args.model_load_dir)
+        print("Restoring model from {}.".format(args.model_load_dir))
+        check_point.load(args.model_load_dir)
+    else:
+        print("Init model on demand.")
+        check_point.init()
+    print("Runing warm up for {} iterations.".format(args.warmup_iter_num))
+    for step in range(args.warmup_iter_num):
+        predictions = InferenceNet().get()
+    main.total_time = 0.0
+    main.batch_size = args.node_num * args.gpu_num_per_node * args.batch_size_per_device
+    main.start_time = time.time()
+
+    def create_callback(step):
+        def callback(predictions):
+            if step % args.print_every_n_iter == 0:
+                cur_time = time.time()
+                duration = cur_time - main.start_time
+                main.total_time += duration
+                main.start_time = cur_time
+                images_per_sec = main.batch_size / duration
+                print(
+                    "iter {}, speed: {:.3f}(sec/batch), {:.3f}(images/sec)".format(
+                        step, duration, images_per_sec
+                    )
+                )
+                if step == args.iter_num - 1:
+                    avg_img_per_sec = main.batch_size * args.iter_num / main.total_time
+                    print("-".ljust(66, "-"))
+                    print("average speed: {:.3f}(images/sec)".format(avg_img_per_sec))
+                    print("-".ljust(66, "-"))
+
+        return callback
+
+    for step in range(args.iter_num):
+        InferenceNet().async_get(create_callback(step))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/resnet_model.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/resnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0df08d5c42cd6d215d91e1d47615b8c6a59be35
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/resnet_model.py
@@ -0,0 +1,141 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+BLOCK_COUNTS = [3, 4, 6, 3]
+BLOCK_FILTERS = [256, 512, 1024, 2048]
+BLOCK_FILTERS_INNER = [64, 128, 256, 512]
+
+
+def _conv2d(
+    name,
+    input,
+    filters,
+    kernel_size,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilations=1,
+    trainable=True,
+    weight_initializer=flow.variance_scaling_initializer(data_format="NCHW"),
+):
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=(filters, input.shape[1], kernel_size, kernel_size),
+        dtype=input.dtype,
+        initializer=weight_initializer,
+        trainable=trainable,
+    )
+    return flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilations, name=name
+    )
+
+
+def _batch_norm(inputs, name=None, trainable=True):
+    return flow.layers.batch_normalization(
+        inputs=inputs,
+        axis=1,
+        momentum=0.997,
+        epsilon=1.001e-05,
+        center=True,
+        scale=True,
+        trainable=trainable,
+        name=name,
+    )
+
+
+def conv2d_affine(input, name, filters, kernel_size, strides, activation=None):
+    padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID"
+    output = _conv2d(name, input, filters, kernel_size, strides, padding)
+    output = _batch_norm(output, name + "_bn")
+    if activation == "Relu":
+        output = flow.math.relu(output)
+    return output
+
+
+def bottleneck_transformation(input, block_name, filters, filters_inner, strides):
+    a = conv2d_affine(
+        input, block_name + "_branch2a", filters_inner, 1, 1, activation="Relu"
+    )
+    b = conv2d_affine(
+        a, block_name + "_branch2b", filters_inner, 3, strides, activation="Relu"
+    )
+    c = conv2d_affine(b, block_name + "_branch2c", filters, 1, 1)
+    return c
+
+
+def residual_block(input, block_name, filters, filters_inner, strides_init):
+    if strides_init != 1 or block_name == "res2_0":
+        shortcut = conv2d_affine(
+            input, block_name + "_branch1", filters, 1, strides_init
+        )
+    else:
+        shortcut = input
+    bottleneck = bottleneck_transformation(
+        input, block_name, filters, filters_inner, strides_init
+    )
+    return flow.math.relu(bottleneck + shortcut)
+
+
+def residual_stage(input, stage_name, counts, filters, filters_inner, stride_init=2):
+    output = input
+    for i in range(counts):
+        block_name = "%s_%d" % (stage_name, i)
+        output = residual_block(
+            output, block_name, filters, filters_inner, stride_init if i == 0 else 1
+        )
+    return output
+
+
+def resnet_conv_x_body(input, on_stage_end=lambda x: x):
+    output = input
+    for (i, (counts, filters, filters_inner)) in enumerate(
+        zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER)
+    ):
+        stage_name = "res%d" % (i + 2)
+        output = residual_stage(
+            output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2
+        )
+        on_stage_end(output)
+    return output
+
+
+def resnet_stem(input):
+    conv1 = _conv2d("conv1", input, 64, 7, 2)
+    conv1_bn = flow.math.relu(_batch_norm(conv1, "conv1_bn"))
+    pool1 = flow.nn.max_pool2d(
+        conv1_bn, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    return pool1
+
+
+def resnet50(images, trainable=True):
+    with flow.scope.namespace("Resnet"):
+        stem = resnet_stem(images)
+        body = resnet_conv_x_body(stem, lambda x: x)
+        pool5 = flow.nn.avg_pool2d(
+            body, ksize=7, strides=1, padding="VALID", data_format="NCHW", name="pool5"
+        )
+        fc1001 = flow.layers.dense(
+            flow.reshape(pool5, (pool5.shape[0], -1)),
+            units=1001,
+            use_bias=True,
+            kernel_initializer=flow.xavier_uniform_initializer(),
+            bias_initializer=flow.zeros_initializer(),
+            trainable=trainable,
+            name="fc1001",
+        )
+    return fc1001
diff --git a/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/vgg_model.py b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/vgg_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1c9c0fc83144e916f8fc2a35b592d41ea2ad27b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/cnn_benchmark/vgg_model.py
@@ -0,0 +1,93 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from model_util import conv2d_layer
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def _conv_block(in_blob, index, filters, conv_times):
+    conv_block = []
+    conv_block.insert(0, in_blob)
+    for i in range(conv_times):
+        conv_i = conv2d_layer(
+            name="conv{}".format(index),
+            input=conv_block[i],
+            filters=filters,
+            kernel_size=3,
+            strides=1,
+        )
+        conv_block.append(conv_i)
+        index += 1
+    return conv_block
+
+
+def vgg16(images, trainable=True):
+    conv1 = _conv_block(images, 0, 64, 2)
+    pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv_block(pool1, 2, 128, 2)
+    pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv_block(pool2, 4, 256, 3)
+    pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", "NCHW", name="pool3")
+    conv4 = _conv_block(pool3, 7, 512, 3)
+    pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", "NCHW", name="pool4")
+    conv5 = _conv_block(pool4, 10, 512, 3)
+    pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_kernel_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    def _get_bias_initializer():
+        bias_initializer = initializer_conf_util.InitializerConf()
+        bias_initializer.constant_conf.value = 0.0
+        return bias_initializer
+
+    pool5 = flow.reshape(pool5, [pool5.shape[0], -1])
+    fc6 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc1",
+    )
+    fc6 = flow.nn.dropout(fc6, rate=0.5)
+    fc7 = flow.layers.dense(
+        inputs=fc6,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc2",
+    )
+    fc7 = flow.nn.dropout(fc7, rate=0.5)
+    fc8 = flow.layers.dense(
+        inputs=fc7,
+        units=1001,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc_final",
+    )
+    return fc8
diff --git a/python/oneflow/compatible/single_client/benchmarks/coco_data_load/coco_data_loader.py b/python/oneflow/compatible/single_client/benchmarks/coco_data_load/coco_data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..272c803cecf87b4167a5b88a26402e7449e8fa8a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/benchmarks/coco_data_load/coco_data_loader.py
@@ -0,0 +1,157 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+import time
+
+import pandas as pd
+
+from oneflow.compatible import single_client as flow
+
+
+class COCODataLoadConfig(object):
+    def __init__(self):
+        self.annotation_file = (
+            "/dataset/mscoco_2017/annotations/instances_train2017.json"
+        )
+        self.image_dir = "/dataset/mscoco_2017/train2017"
+        self.shuffle_after_epoch = True
+        self.stride_partition = False
+        self.batch_size = 2
+        self.target_size = 800
+        self.max_size = 1333
+        self.image_align_size = 32
+        self.image_normal_std = (1.0, 1.0, 1.0)
+        self.image_normal_mean = (102.9801, 115.9465, 122.7717)
+        self.max_num_objs = 512
+
+
+def roundup(x, align):
+    return int(math.ceil(x / float(align)) * align)
+
+
+def coco_data_load(cfg, machine_id, nrank):
+    with flow.scope.placement("cpu", "{}:0-{}".format(machine_id, nrank - 1)):
+        (
+            image,
+            image_id,
+            image_size,
+            bbox,
+            label,
+            segm_poly,
+            segm_poly_index,
+        ) = flow.data.coco_reader(
+            annotation_file=cfg.annotation_file,
+            image_dir=cfg.image_dir,
+            batch_size=cfg.batch_size,
+            shuffle=cfg.shuffle_after_epoch,
+            stride_partition=cfg.stride_partition,
+            name="coco_reader",
+        )
+        image = flow.image.decode(image, dtype=flow.float)
+        aligned_target_size = roundup(cfg.target_size, cfg.image_align_size)
+        aligned_max_size = roundup(cfg.max_size, cfg.image_align_size)
+        (image, new_size, scale) = flow.image.target_resize(
+            image, target_size=aligned_target_size, max_size=aligned_max_size
+        )
+        bbox = flow.detection.object_bbox_scale(bbox, scale)
+        segm_poly = flow.detection.object_segmentation_polygon_scale(segm_poly, scale)
+        flip_code = flow.random.coin_flip(cfg.batch_size)
+        image = flow.image.flip(image, flip_code)
+        bbox = flow.detection.object_bbox_flip(bbox, new_size, flip_code)
+        segm_poly = flow.detection.object_segmentation_polygon_flip(
+            segm_poly, new_size, flip_code
+        )
+        image = flow.image.normalize(image, cfg.image_normal_std, cfg.image_normal_mean)
+        image = flow.image.batch_align(
+            image,
+            shape=(aligned_target_size, aligned_max_size, 3),
+            dtype=flow.float,
+            alignment=cfg.image_align_size,
+        )
+        gt_bbox = flow.tensor_buffer_to_list_of_tensors(
+            bbox, (cfg.max_num_objs, 4), flow.float, True
+        )
+        gt_label = flow.tensor_buffer_to_list_of_tensors(
+            label, (cfg.max_num_objs,), flow.int32, True
+        )
+        segm_mask = flow.detection.object_segmentation_polygon_to_mask(
+            segm_poly, segm_poly_index, new_size
+        )
+        gt_mask = flow.tensor_buffer_to_list_of_tensors(
+            segm_mask,
+            (cfg.max_num_objs, aligned_target_size, aligned_max_size),
+            flow.int8,
+            True,
+        )
+        return {
+            "image": image,
+            "image_size": new_size,
+            "gt_bbox": list(gt_bbox),
+            "gt_label": list(gt_label),
+            "gt_mask": list(gt_mask),
+        }
+
+
+def _make_data_load_fn():
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_distribute_strategy(flow.scope.consistent_view())
+    cfg = COCODataLoadConfig()
+
+    @flow.global_function(func_config)
+    def data_load_fn():
+        return coco_data_load(cfg, 0, 1)
+
+    return data_load_fn
+
+
+def _benchmark(iter_num, drop_first_iters, verbose=False):
+    flow.env.init()
+    data_loader = _make_data_load_fn()
+    s = pd.Series([], name="time_elapsed", dtype="float32")
+    timestamp = time.perf_counter()
+    for i in range(iter_num):
+        dict = data_loader().get()
+        image = dict["image"]
+        image_size = dict["image_size"]
+        gt_bbox = dict["gt_bbox"]
+        gt_label = dict["gt_label"]
+        gt_mask = dict["gt_mask"]
+        cur = time.perf_counter()
+        s[i] = cur - timestamp
+        timestamp = cur
+        if verbose:
+            print("==== iter {} ====".format(i))
+            print(
+                "image: {}\n".format(image.numpy_list()[0].shape), image.numpy_list()[0]
+            )
+            print(
+                "image_size: {}\n".format(image_size.numpy().shape), image_size.numpy()
+            )
+            print("gt_bbox:\n", [x.numpy_list()[0] for x in gt_bbox])
+            print("gt_label:\n", [x.numpy_list()[0] for x in gt_label])
+            print("gt_mask:\n", [x.numpy_list()[0] for x in gt_mask])
+    print(
+        "mean of time elapsed of {} iters (dropped {} first iters): {}".format(
+            iter_num, drop_first_iters, s[drop_first_iters:].mean()
+        )
+    )
+    s.to_csv("coco_data_benchmark.csv", header=True)
+
+
+if __name__ == "__main__":
+    _benchmark(500, 10)
diff --git a/python/oneflow/compatible/single_client/checkpoint.py b/python/oneflow/compatible/single_client/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..709007e076d710f7c332aa4897cb50f834af1869
--- /dev/null
+++ b/python/oneflow/compatible/single_client/checkpoint.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.check_point_v2 import (
+    GetCheckpoint as get,
+)
+from oneflow.compatible.single_client.framework.check_point_v2 import (
+    SaveVarDict as save,
+)
diff --git a/python/oneflow/compatible/single_client/config/__init__.py b/python/oneflow/compatible/single_client/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf1a2dde04ffc79c658e007db224849c27d6e7e9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/config/__init__.py
@@ -0,0 +1,89 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.config_util import (
+    api_collect_act_event as collect_act_event,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_comm_net_worker_num as comm_net_worker_num,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_compute_thread_pool_size as compute_thread_pool_size,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_cpu_device_num as cpu_device_num,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_disable_group_boxing_by_dst_parallel as disable_group_boxing_by_dst_parallel,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_enable_debug_mode as enable_debug_mode,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_enable_legacy_model_io as enable_legacy_model_io,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_enable_mem_chain_merge as enable_mem_chain_merge,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_enable_model_io_v2 as enable_model_io_v2,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_enable_tensor_float_32_compute as enable_tensor_float_32_compute,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_gpu_device_num as gpu_device_num,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_legacy_model_io_enabled as legacy_model_io_enabled,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_load_library as load_library,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_load_library_now as load_library_now,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_machine_num as machine_num,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_max_mdsave_worker_num as max_mdsave_worker_num,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_use_compute_stream as nccl_use_compute_stream,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_numa_aware_cuda_malloc_host as enable_numa_aware_cuda_malloc_host,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_rdma_mem_block_mbyte as rdma_mem_block_mbyte,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_rdma_recv_msg_buf_mbyte as rdma_recv_msg_buf_mbyte,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_reserved_device_mem_mbyte as reserved_device_mem_mbyte,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_reserved_host_mem_mbyte as reserved_host_mem_mbyte,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_thread_enable_local_message_queue as thread_enable_local_message_queue,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_use_rdma as use_rdma,
+)
+
+from . import collective_boxing
diff --git a/python/oneflow/compatible/single_client/config/collective_boxing.py b/python/oneflow/compatible/single_client/config/collective_boxing.py
new file mode 100644
index 0000000000000000000000000000000000000000..77c67e39c13f5eeb6e24db67e9546f7f5ee72cd7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/config/collective_boxing.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.config_util import (
+    api_enable_fusion as enable_fusion,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_enable_all_to_all as nccl_enable_all_to_all,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_enable_mixed_fusion as nccl_enable_mixed_fusion,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_all_gather as nccl_fusion_all_gather,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_all_reduce as nccl_fusion_all_reduce,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_all_reduce_use_buffer as nccl_fusion_all_reduce_use_buffer,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_broadcast as nccl_fusion_broadcast,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_max_ops as nccl_fusion_max_ops,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_reduce as nccl_fusion_reduce,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_reduce_scatter as nccl_fusion_reduce_scatter,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_fusion_threshold_mb as nccl_fusion_threshold_mb,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_nccl_num_streams as nccl_num_streams,
+)
+from oneflow.compatible.single_client.framework.config_util import (
+    api_num_callback_threads as num_callback_threads,
+)
diff --git a/python/oneflow/compatible/single_client/contrib/__init__.py b/python/oneflow/compatible/single_client/contrib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb5daf782a0d43a4e7d17d9e2462194ac5658caa
--- /dev/null
+++ b/python/oneflow/compatible/single_client/contrib/__init__.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .tensorrt import *
diff --git a/python/oneflow/compatible/single_client/contrib/tensorrt/__init__.py b/python/oneflow/compatible/single_client/contrib/tensorrt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/contrib/tensorrt/tensorrt_api.py b/python/oneflow/compatible/single_client/contrib/tensorrt/tensorrt_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..6add6042f7859b6e133879fb846eada5a08bc573
--- /dev/null
+++ b/python/oneflow/compatible/single_client/contrib/tensorrt/tensorrt_api.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import oneflow._oneflow_internal
+
+
+def write_int8_calibration(path):
+    try:
+        oneflow._oneflow_internal.WriteInt8Calibration(path)
+    except oneflow._oneflow_internal.exception.CompileOptionWrongException:
+        traceback.print_exc()
+
+
+def cache_int8_calibration():
+    try:
+        oneflow._oneflow_internal.CacheInt8Calibration()
+    except oneflow._oneflow_internal.exception.CompileOptionWrongException:
+        traceback.print_exc()
diff --git a/python/oneflow/compatible/single_client/data.py b/python/oneflow/compatible/single_client/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcfd58c43086d2338b94f91fe8a301b8f5e19b0c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/data.py
@@ -0,0 +1,71 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.experimental.load_mnist import load_mnist
+from oneflow.compatible.single_client.ops.data_ops import (
+    BlobConf,
+    ImageCodec,
+    ImagePreprocessor,
+    ImageResizePreprocessor,
+    NormByChannelPreprocessor,
+    RawCodec,
+    decode_ofrecord,
+    decode_random,
+)
+from oneflow.compatible.single_client.ops.data_ops import (
+    image_decoder_random_crop_resize,
+)
+from oneflow.compatible.single_client.ops.data_ops import (
+    image_decoder_random_crop_resize as ImageDecoderRandomCropResize,
+)
+from oneflow.compatible.single_client.ops.data_ops import (
+    ofrecord_loader,
+    ofrecord_reader,
+    onerec_reader,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import OFRecordBytesDecoder
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    OFRecordBytesDecoder as ofrecord_bytes_decoder,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import OFRecordImageDecoder
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    OFRecordImageDecoder as ofrecord_image_decoder,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import OFRecordRawDecoder
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    OFRecordRawDecoder as ofrecord_raw_decoder,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import OneRecDecoder
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    OneRecDecoder as onerec_decoder,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_coco_reader as coco_reader,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_ofrecord_image_decoder_random_crop as OFRecordImageDecoderRandomCrop,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_ofrecord_image_decoder_random_crop as ofrecord_image_decoder_random_crop,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    gpt_data_loader as MegatronGPTMMapDataLoader,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    gpt_data_loader as megatron_gpt_mmap_data_loader,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    ofrecord_image_classification_reader,
+)
diff --git a/python/oneflow/compatible/single_client/deprecated/__init__.py b/python/oneflow/compatible/single_client/deprecated/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..17eb58b3cf2284ba7a02474f034c39d79e21dccd
--- /dev/null
+++ b/python/oneflow/compatible/single_client/deprecated/__init__.py
@@ -0,0 +1,25 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.compatible.single_client.deprecated.init_cluster_env import (
+    delete_worker_by_bootstrap,
+    delete_worker_of_multi_process,
+)
+from oneflow.compatible.single_client.experimental.namescope import (
+    deprecated_name_scope as variable_scope,
+)
+
+from . import nn
diff --git a/python/oneflow/compatible/single_client/deprecated/init_cluster_env.py b/python/oneflow/compatible/single_client/deprecated/init_cluster_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadafe4a45fe7d4ee08d5e8c6f7a29c5d6dc24f3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/deprecated/init_cluster_env.py
@@ -0,0 +1,63 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import getpass
+import os
+import subprocess
+import sys
+import uuid
+from tempfile import NamedTemporaryFile
+
+from google.protobuf import text_format as pbtxt
+
+from oneflow.compatible.single_client.framework import env_util as env_util
+from oneflow.core.control.ctrl_bootstrap_pb2 import BootstrapConf
+from oneflow.core.job.env_pb2 import EnvProto
+
+
+def delete_worker_by_bootstrap(ssh_port=22) -> None:
+    ssh_port_arg = " -p {} ".format(ssh_port)
+    bootstrap_conf_list = env_util.global_ctrl_bootstrap_confs
+    assert isinstance(bootstrap_conf_list, list)
+    global _temp_run_dir
+    assert _temp_run_dir != ""
+    for bootstrap_conf in bootstrap_conf_list:
+        assert isinstance(bootstrap_conf, BootstrapConf)
+        if bootstrap_conf.rank == 0:
+            continue
+        ssh_prefix = (
+            "ssh {} ".format(ssh_port_arg)
+            + getpass.getuser()
+            + "@"
+            + bootstrap_conf.host
+            + " "
+        )
+        if os.getenv("ONEFLOW_WORKER_KEEP_LOG"):
+            print("worker log kept at: {}".format(bootstrap_conf.host), flush=True)
+        else:
+            _SystemCall(ssh_prefix + '"rm -r ' + _temp_run_dir + '"')
+            print("temp run dir removed at: {}".format(bootstrap_conf.host), flush=True)
+
+
+def delete_worker_of_multi_process(run_dir) -> None:
+    assert run_dir != ""
+    if os.getenv("ONEFLOW_WORKER_KEEP_LOG"):
+        print("worker log kept at localhost:" + run_dir, flush=True)
+    else:
+        os.system("rm -r " + run_dir)
+        print("temp run dir removed at localhost:" + run_dir, flush=True)
+
+
+_temp_run_dir = ""
diff --git a/python/oneflow/compatible/single_client/deprecated/initializer_util.py b/python/oneflow/compatible/single_client/deprecated/initializer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c3dd4693f66477b807181b8fc554d60ea7596d9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/deprecated/initializer_util.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.core.common import data_type_pb2 as data_type_conf_util
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def truncated_normal_initializer(
+    stddev: float = 1.0,
+) -> initializer_conf_util.InitializerConf:
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.truncated_normal_conf, "std", float(stddev))
+    return initializer
diff --git a/python/oneflow/compatible/single_client/deprecated/nn.py b/python/oneflow/compatible/single_client/deprecated/nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef6127a8679323607ed0df552a19b516cfc726f4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/deprecated/nn.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.module import Module
diff --git a/python/oneflow/compatible/single_client/detection.py b/python/oneflow/compatible/single_client/detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..58fb7e6e3122ec5d505f4e0960e1bd5a00e714e4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/detection.py
@@ -0,0 +1,28 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_bbox_flip,
+    object_bbox_scale,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_segm_poly_flip as object_segmentation_polygon_flip,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_segm_poly_scale as object_segmentation_polygon_scale,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    object_segm_poly_to_mask as object_segmentation_polygon_to_mask,
+)
diff --git a/python/oneflow/compatible/single_client/distribute.py b/python/oneflow/compatible/single_client/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef0cd59ed9e40583b8c202712863e43429b63037
--- /dev/null
+++ b/python/oneflow/compatible/single_client/distribute.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.distribute import (
+    assert_is_valid_distribute,
+    auto,
+    broadcast,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    deprecated_consistent_strategy as consistent_strategy,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    deprecated_consistent_strategy_enabled as consistent_strategy_enabled,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    deprecated_mirrored_strategy as mirrored_strategy,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    deprecated_mirrored_strategy_enabled as mirrored_strategy_enabled,
+)
+from oneflow.compatible.single_client.framework.distribute import split
diff --git a/python/oneflow/compatible/single_client/distributed.py b/python/oneflow/compatible/single_client/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e6e6f4b0808aa5a1e13c259b1294656e3b88b9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/distributed.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.distribute import (
+    get_local_rank,
+    get_rank,
+    get_world_size,
+    is_multi_client,
+)
diff --git a/python/oneflow/compatible/single_client/eager/__init__.py b/python/oneflow/compatible/single_client/eager/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/eager/blob_register.py b/python/oneflow/compatible/single_client/eager/blob_register.py
new file mode 100644
index 0000000000000000000000000000000000000000..437788ffa8858bf473b43fa54e51baf83fb2ed4c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/blob_register.py
@@ -0,0 +1,35 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+
+
+@contextmanager
+def BnInOp2BlobObjectScope(blob_register, op_attribute):
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    for ibn in op_attribute.input_bns:
+        lbi = op_attribute.arg_signature.bn_in_op2lbi[ibn]
+        bn_in_op2blob_object[ibn] = blob_register.GetObject4BlobName(
+            "%s/%s" % (lbi.op_name, lbi.blob_name)
+        )
+    yield bn_in_op2blob_object
+    for obn in op_attribute.output_bns:
+        lbi = op_attribute.arg_signature.bn_in_op2lbi[obn]
+        blob_register.SetObject4BlobName(
+            "%s/%s" % (lbi.op_name, lbi.blob_name), bn_in_op2blob_object[obn]
+        )
diff --git a/python/oneflow/compatible/single_client/eager/boxing_hob.py b/python/oneflow/compatible/single_client/eager/boxing_hob.py
new file mode 100644
index 0000000000000000000000000000000000000000..42f196e83a5088b8a82f8596951067ef7a69392d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/boxing_hob.py
@@ -0,0 +1,167 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.support.high_order_bool import (
+    BoolFunctor,
+    bool_functor,
+    hob_context_attr,
+)
+
+
+class BoxingHobContext(object):
+    def __init__(self, produced_blob_object, consumer_op_arg_parallel_attr):
+        self.produced_blob_object_ = produced_blob_object
+        self.consumer_op_arg_parallel_attr_ = consumer_op_arg_parallel_attr
+        self.composer2lhs_context = {}
+        self.composer2rhs_context = {}
+        self.composer2middle_op_arg_parallel_attr = {}
+
+    @property
+    def produced_blob_object(self):
+        return self.produced_blob_object_
+
+    @property
+    def consumer_op_arg_parallel_attr(self):
+        return self.consumer_op_arg_parallel_attr_
+
+
+class ComposeHob(BoolFunctor):
+    def __init__(
+        self, lhs_hob, rhs_hob, get_middle_op_arg_parallel_attr, middle_verbose_str=None
+    ):
+        self.get_middle_op_arg_parallel_attr_ = get_middle_op_arg_parallel_attr
+        self.lhs_hob_ = lhs_hob
+        self.rhs_hob_ = rhs_hob
+        self.ctx_id2middle_op_arg_parallel_attr_ = {}
+        self.middle_verbose_str_ = middle_verbose_str
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        left_display = self.lhs_hob_.debug_str(self._GetLhsContext(ctx), display_result)
+        display_result = display_result and self.lhs_hob_(self._GetLhsContext(ctx))
+        right_display = self.rhs_hob_.debug_str(
+            self._GetRhsContext(ctx), display_result
+        )
+        return "%s -> %s" % (left_display, right_display)
+
+    def __call__(self, ctx):
+        return self.lhs_hob_(self._GetLhsContext(ctx)) and self.rhs_hob_(
+            self._GetRhsContext(ctx)
+        )
+
+    def _GetLhsContext(self, ctx):
+        if self not in ctx.composer2lhs_context:
+            blob_object = oneflow._oneflow_internal.BlobObject(
+                ctx.produced_blob_object.object_id,
+                ctx.produced_blob_object.op_arg_parallel_attr,
+                ctx.produced_blob_object.op_arg_blob_attr,
+            )
+            value = BoxingHobContext(blob_object, self._GetMiddleOpArgParallelAttr(ctx))
+            ctx.composer2lhs_context[self] = value
+        return ctx.composer2lhs_context[self]
+
+    def _GetRhsContext(self, ctx):
+        if self not in ctx.composer2rhs_context:
+            middle_blob_object = oneflow._oneflow_internal.BlobObject(
+                ctx.produced_blob_object.object_id,
+                self._GetMiddleOpArgParallelAttr(ctx),
+                ctx.produced_blob_object.op_arg_blob_attr,
+            )
+            value = BoxingHobContext(
+                middle_blob_object, ctx.consumer_op_arg_parallel_attr
+            )
+            ctx.composer2rhs_context[self] = value
+        return ctx.composer2rhs_context[self]
+
+    def _GetMiddleOpArgParallelAttr(self, ctx):
+        if self not in ctx.composer2middle_op_arg_parallel_attr:
+            value = self.get_middle_op_arg_parallel_attr_(
+                None, ctx.produced_blob_object, ctx.consumer_op_arg_parallel_attr
+            )
+            if self.middle_verbose_str_ is not None:
+                print("=== %s ===" % self.middle_verbose_str_)
+                print(value)
+            ctx.composer2middle_op_arg_parallel_attr[self] = value
+        return ctx.composer2middle_op_arg_parallel_attr[self]
+
+
+@bool_functor("SingleMachine")
+def SingleMachine(ctx):
+    blob_device_ids = dict(
+        ctx.produced_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    )
+    arg_parallel_desc_symbol = ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
+    op_arg_device_ids = dict(arg_parallel_desc_symbol.machine_id2device_id_list)
+    return list(blob_device_ids.keys()) == [0] and list(op_arg_device_ids.keys()) == [0]
+
+
+@bool_functor("MatchDeviceOneToOnePerMachine")
+def MatchDeviceOneToOnePerMachine(ctx):
+    blob_device_ids = dict(
+        ctx.produced_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    )
+    arg_parallel_desc_symbol = ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
+    op_arg_device_ids = dict(arg_parallel_desc_symbol.machine_id2device_id_list)
+    if blob_device_ids.keys() != op_arg_device_ids.keys():
+        return False
+    for key in blob_device_ids.keys():
+        if len(blob_device_ids[key]) != len(op_arg_device_ids[key]):
+            return False
+    return True
+
+
+@bool_functor("Verbose")
+def Verbose(ctx):
+    print("============[producer]============")
+    print(ctx.produced_blob_object.op_arg_parallel_attr.parallel_desc_symbol)
+    print(ctx.produced_blob_object.op_arg_parallel_attr.sbp_parallel)
+    print("============[consumer]============")
+    print(ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol)
+    print(ctx.consumer_op_arg_parallel_attr.sbp_parallel)
+    return True
+
+
+@bool_functor("producer's devices contained in consumer's devices")
+def ProducerDevicesContainedInConsumerDevices(ctx):
+    return ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol.Containing(
+        ctx.produced_blob_object.parallel_desc_symbol
+    )
+
+
+@bool_functor("consumer's devices contained in producer's devices")
+def ConsumerDevicesContainedInProducerDevices(ctx):
+    return ctx.produced_blob_object.parallel_desc_symbol.Containing(
+        ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
+    )
+
+
+@hob_context_attr("consumer_sbp_parallel")
+def consumer_sbp_parallel(ctx):
+    return ctx.consumer_op_arg_parallel_attr.sbp_parallel
+
+
+@hob_context_attr("producer_sbp_parallel")
+def producer_sbp_parallel(ctx):
+    return ctx.produced_blob_object.op_arg_parallel_attr.sbp_parallel
+
+
+@hob_context_attr("producer_parallel_desc")
+def producer_parallel_desc(ctx):
+    return ctx.produced_blob_object.op_arg_parallel_attr.parallel_desc_symbol
+
+
+@hob_context_attr("consumer_parallel_desc")
+def consumer_parallel_desc(ctx):
+    return ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
diff --git a/python/oneflow/compatible/single_client/eager/boxing_middle.py b/python/oneflow/compatible/single_client/eager/boxing_middle.py
new file mode 100644
index 0000000000000000000000000000000000000000..7de49c3771926a6a91ccdfb8d59fc53a37a5ec77
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/boxing_middle.py
@@ -0,0 +1,173 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.common import shape as shape_proto_cfg
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow.compatible.single_client.eager import symbol as symbol_util
+from oneflow.core.job import sbp_parallel_pb2 as sbp_parallel_pb
+
+
+class BoxingToMiddle(object):
+    def __init__(
+        self,
+        boxing_method,
+        get_middle_parallel_desc_symbol,
+        get_middle_sbp_parallel,
+        verbose=False,
+    ):
+        self.boxing_method_ = boxing_method
+        self.get_middle_op_arg_parallel_attr_ = MiddleOpArgParallelAttr(
+            get_middle_parallel_desc_symbol, get_middle_sbp_parallel
+        )
+        self.verbose_ = verbose
+
+    @property
+    def boxing_method(self):
+        return self.boxing_method_
+
+    @property
+    def get_middle_op_arg_parallel_attr(self):
+        return self.get_middle_op_arg_parallel_attr_
+
+    @property
+    def verbose(self):
+        return self.verbose_
+
+
+def MiddleOpArgParallelAttr(get_parallel_desc_symbol, get_sbp_parallel):
+    def GetOpArgParallelAttr(
+        builder, produced_blob_object, consumer_op_arg_parallel_attr
+    ):
+        return oneflow._oneflow_internal.OpArgParallelAttribute(
+            get_parallel_desc_symbol(
+                builder, produced_blob_object, consumer_op_arg_parallel_attr
+            ),
+            str(
+                get_sbp_parallel(
+                    builder, produced_blob_object, consumer_op_arg_parallel_attr
+                )
+            ),
+            str(produced_blob_object.op_arg_parallel_attr.opt_mirrored_parallel),
+        )
+
+    return GetOpArgParallelAttr
+
+
+def ReplaceProducerDeviceTag(new_device_tag):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        x_parallel_attr = produced_blob_object.op_arg_parallel_attr
+        return TryReplaceDeviceTag(
+            builder, x_parallel_attr.parallel_desc_symbol, new_device_tag
+        )
+
+    return Getter
+
+
+def ProducerRandomParallelIdPerMachine(device_tag=None):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        return RandomParallelIdPerMachine(
+            produced_blob_object.parallel_desc_symbol,
+            device_tag=device_tag,
+            builder=builder,
+        )
+
+    return Getter
+
+
+def ConsumerRandomParallelIdPerMachine(device_tag=None):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        return RandomParallelIdPerMachine(
+            consumer_op_arg_parallel_attr.parallel_desc_symbol,
+            device_tag=device_tag,
+            builder=builder,
+        )
+
+    return Getter
+
+
+def ProducerParallelDesc(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object.parallel_desc_symbol
+
+
+def ConsumerParallelDesc(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return consumer_op_arg_parallel_attr.parallel_desc_symbol
+
+
+def ReplaceConsumerDeviceTag(new_device_tag):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        parallel_desc_sym = consumer_op_arg_parallel_attr.parallel_desc_symbol
+        return TryReplaceDeviceTag(builder, parallel_desc_sym, new_device_tag)
+
+    return Getter
+
+
+def BroadcastParallel(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    sbp_parallel = sbp_parallel_pb.SbpParallel()
+    sbp_parallel.broadcast_parallel.SetInParent()
+    return sbp_parallel
+
+
+def ProducerSbpParallel(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object.op_arg_parallel_attr.sbp_parallel
+
+
+def ConsumerSbpParallel(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return consumer_op_arg_parallel_attr.sbp_parallel
+
+
+def TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag):
+    if parallel_desc_symbol.device_tag == device_tag:
+        return parallel_desc_symbol
+    else:
+        return ReplaceDeviceTag(parallel_desc_symbol, device_tag, builder=builder)
+
+
+def ReplaceDeviceTag(parallel_desc_symbol, device_tag, builder=None):
+    assert parallel_desc_symbol.device_tag != device_tag
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag(device_tag)
+    for device_name in parallel_desc_symbol.parallel_conf.device_name():
+        parallel_conf.add_device_name(device_name)
+    hierarchy = shape_proto_cfg.ShapeProto()
+    for dim in parallel_desc_symbol.hierarchy:
+        hierarchy.add_dim(dim)
+    assert hierarchy.dim_size() > 0
+    parallel_conf.mutable_hierarchy().CopyFrom(hierarchy)
+    if builder is None:
+        return oneflow._oneflow_internal.PlacementSymbol(
+            parallel_desc_symbol.symbol_id, parallel_conf
+        )
+    else:
+        return builder.GetParallelDescSymbol(parallel_conf)
+
+
+def RandomParallelIdPerMachine(parallel_desc_symbol, device_tag=None, builder=None):
+    if device_tag is None:
+        device_tag = parallel_desc_symbol.parallel_conf.device_tag()
+    assert device_tag is not None
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag(device_tag)
+    for (machine_id, dev_ids) in parallel_desc_symbol.machine_id2device_id_list.items():
+        dev_id = dev_ids[random.randint(0, len(dev_ids) - 1)]
+        parallel_conf.add_device_name("@%s:%s" % (machine_id, dev_id))
+    if builder is None:
+        return oneflow._oneflow_internal.PlacementSymbol(
+            parallel_desc_symbol.symbol_id, parallel_conf
+        )
+    else:
+        return builder.GetParallelDescSymbol(parallel_conf)
diff --git a/python/oneflow/compatible/single_client/eager/boxing_util.py b/python/oneflow/compatible/single_client/eager/boxing_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..81a545a966e780a3413d454d5c7f8e7c536b9027
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/boxing_util.py
@@ -0,0 +1,983 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.common import shape as shape_proto_cfg
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import boxing_hob as boxing_hob
+from oneflow.compatible.single_client.eager import boxing_middle as boxing_middle
+from oneflow.compatible.single_client.eager import op_infer_util as op_infer_util
+from oneflow.compatible.single_client.eager import symbol as symbol_util
+from oneflow.compatible.single_client.eager.boxing_hob import BoxingHobContext
+from oneflow.compatible.single_client.framework import (
+    balanced_splitter as balanced_splitter,
+)
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.compatible.single_client.support import high_order_bool as high_order_bool
+from oneflow.core.job import sbp_parallel_pb2 as sbp_parallel_pb
+from oneflow.core.operator import op_attribute_pb2 as op_attribute_pb
+from oneflow.core.operator import op_conf_pb2 as op_conf_pb
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def BoxingTo(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    hob_context = BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr)
+    if enable_if.get_condition_hob(NoBoxing)(hob_context):
+        return produced_blob_object
+    producer_opt_mirrored_parallel = (
+        produced_blob_object.op_arg_parallel_attr.opt_mirrored_parallel
+    )
+    consumer_opt_mirrored_parallel = consumer_op_arg_parallel_attr.opt_mirrored_parallel
+    assert producer_opt_mirrored_parallel == consumer_opt_mirrored_parallel, (
+        "\nproducer_op_arg_parallel_attr: %s\nconsumer_op_arg_parallel_attr: %s"
+        % (produced_blob_object.op_arg_parallel_attr, consumer_op_arg_parallel_attr)
+    )
+
+    def default(get_failed_info, *args, **kwargs):
+        raise NotImplementedError(
+            "%s\nno boxing method found.\nlogical_blob_name: %s\nx_arg_attribute: %s\nconsumer_op_arg_parallel_attr: %s\n"
+            % (
+                get_failed_info(),
+                produced_blob_object.op_arg_blob_attr.logical_blob_name,
+                produced_blob_object.op_arg_parallel_attr,
+                consumer_op_arg_parallel_attr,
+            )
+        )
+
+    global conditional_function_table
+    function = enable_if.unique(
+        conditional_function_table,
+        context=BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr),
+        default=default,
+    )
+    return function(builder, produced_blob_object, consumer_op_arg_parallel_attr)
+
+
+def boxing_condition(hob_expr, verbose=False):
+    def Decorator(func):
+        func.__oneflow_condition_hob__ = hob_expr
+        if not verbose:
+            hob_expr.__debug_str__ = GetBoxingDebugString(func)
+        return func
+
+    return Decorator
+
+
+def FirstMatchedBoxing(*boxing_methods):
+    hob_expr = enable_if.get_condition_hob(boxing_methods[0])
+    for boxing_method in boxing_methods[1:]:
+        hob_expr = hob_expr | enable_if.get_condition_hob(boxing_method)
+
+    @enable_if.condition(hob_expr)
+    def FirstMatched(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        ctx = BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr)
+        for boxing_method in boxing_methods:
+            hob_expr = enable_if.get_condition_hob(boxing_method)
+            if not hob_expr(ctx):
+                continue
+            return boxing_method(
+                builder, produced_blob_object, consumer_op_arg_parallel_attr
+            )
+
+    boxing_methods_names = [GetBoxingDebugString(m) for m in boxing_methods]
+    FirstMatched.__debug_str__ = "(%s)" % " | ".join(boxing_methods_names)
+    return FirstMatched
+
+
+def OptionalBoxing(boxing_method):
+    opt_boxing_method = FirstMatchedBoxing(boxing_method, NoBoxing)
+    debug_str = "Optional(%s)" % GetBoxingDebugString(boxing_method)
+    opt_boxing_method.__debug_str__ = debug_str
+    return opt_boxing_method
+
+
+def ComposeBoxing(
+    lhs_boxing, rhs_boxing, get_middle_op_arg_parallel_attr, middle_verbose_str=None
+):
+    composed_hob = boxing_hob.ComposeHob(
+        enable_if.get_condition_hob(lhs_boxing),
+        enable_if.get_condition_hob(rhs_boxing),
+        get_middle_op_arg_parallel_attr=get_middle_op_arg_parallel_attr,
+        middle_verbose_str=middle_verbose_str,
+    )
+
+    @enable_if.condition(composed_hob)
+    def Composed(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        tmp_op_arg_parallel_attr = get_middle_op_arg_parallel_attr(
+            builder, produced_blob_object, consumer_op_arg_parallel_attr
+        )
+        tmp = lhs_boxing(builder, produced_blob_object, tmp_op_arg_parallel_attr)
+        return rhs_boxing(builder, tmp, consumer_op_arg_parallel_attr)
+
+    Composed.__debug_str__ = "%s->%s" % (
+        GetBoxingDebugString(lhs_boxing),
+        GetBoxingDebugString(rhs_boxing),
+    )
+    Composed.__left_debug_str__ = GetBoxingLeftDebugString(lhs_boxing)
+    Composed.__right_debug_str__ = GetBoxingRightDebugString(rhs_boxing)
+    return Composed
+
+
+def GetBoxingDebugString(boxing_method):
+    if hasattr(boxing_method, "__debug_str__"):
+        return boxing_method.__debug_str__
+    else:
+        return boxing_method.__name__
+
+
+def GetBoxingLeftDebugString(boxing_method):
+    if hasattr(boxing_method, "__left_debug_str__"):
+        return boxing_method.__left_debug_str__
+    else:
+        return GetBoxingDebugString(boxing_method)
+
+
+def GetBoxingRightDebugString(boxing_method):
+    if hasattr(boxing_method, "__right_debug_str__"):
+        return boxing_method.__right_debug_str__
+    else:
+        return GetBoxingDebugString(boxing_method)
+
+
+def Sequential(*boxing_methods, exclude=tuple(), middle_verbose=False):
+    assert not isinstance(boxing_methods[-1], boxing_middle.BoxingToMiddle)
+    composed = boxing_methods[-1]
+    for boxing_to_middle in boxing_methods[-2::-1]:
+        assert isinstance(boxing_to_middle, boxing_middle.BoxingToMiddle)
+        if middle_verbose:
+            middle_verbose_str = "middle op_arg_parallel_attr of %s->%s:" % (
+                GetBoxingDebugString(boxing_to_middle.boxing_method),
+                GetBoxingLeftDebugString(composed),
+            )
+        else:
+            middle_verbose_str = None
+        composed = ComposeBoxing(
+            boxing_to_middle.boxing_method,
+            composed,
+            boxing_to_middle.get_middle_op_arg_parallel_attr,
+            middle_verbose_str=middle_verbose_str,
+        )
+    if len(exclude) > 0:
+        exclude_hob = enable_if.get_condition_hob(exclude[0])
+        for method in exclude[1:]:
+            exclude_hob = exclude_hob | enable_if.get_condition_hob(method)
+        old_hob = enable_if.get_condition_hob(composed)
+        enable_if.set_condition_hob(composed, old_hob & ~exclude_hob)
+    return composed
+
+
+MatchCopyH2D = (
+    (
+        boxing_hob.producer_parallel_desc.machine_id2device_id_list
+        == boxing_hob.consumer_parallel_desc.machine_id2device_id_list
+    )
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+    & (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "gpu")
+)
+
+
+@boxing_condition(MatchCopyH2D)
+def CopyH2D(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return CopyHD(builder, produced_blob_object, consumer_op_arg_parallel_attr)
+
+
+MatchCopyD2H = (
+    (
+        boxing_hob.producer_parallel_desc.machine_id2device_id_list
+        == boxing_hob.consumer_parallel_desc.machine_id2device_id_list
+    )
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+    & (boxing_hob.producer_parallel_desc.device_tag == "gpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+)
+
+
+@boxing_condition(MatchCopyD2H)
+def CopyD2H(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return CopyHD(builder, produced_blob_object, consumer_op_arg_parallel_attr)
+
+
+def CopyHD(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    arg_parallel_desc_symbol = consumer_op_arg_parallel_attr.parallel_desc_symbol
+    op_device_tag = arg_parallel_desc_symbol.device_tag
+    return BuildCopyHdInstruction(builder, produced_blob_object, op_device_tag)
+
+
+BlobIsPartialSum = boxing_hob.producer_sbp_parallel.HasField("partial_sum_parallel")
+OpArgIsBroadcast = boxing_hob.consumer_sbp_parallel.HasField("broadcast_parallel")
+MatchInterNodeOneToMany = (
+    ~boxing_hob.SingleMachine
+    & (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & OpArgIsBroadcast
+)
+
+
+@boxing_condition(MatchInterNodeOneToMany)
+def InterNodeOneToMany(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    out_blobs = []
+    consumer_dev_ids = (
+        consumer_op_arg_parallel_attr.parallel_desc_symbol.machine_id2device_id_list
+    )
+    for (machine_id, device_ids) in consumer_dev_ids.items():
+        for device_id in device_ids:
+            parallel_conf = placement_cfg.ParallelConf()
+            parallel_conf.set_device_tag("cpu")
+            parallel_conf.add_device_name("@%s:%s" % (machine_id, device_id))
+            parallel_desc_symbol = builder.GetParallelDescSymbol(parallel_conf)
+            out_blob = builder.Build121To(produced_blob_object, parallel_desc_symbol)
+            out_blobs.append(out_blob)
+    return PackPhysicalBoxingBlobObjectsToLogical(
+        builder,
+        out_blobs,
+        consumer_op_arg_parallel_attr,
+        produced_blob_object.op_arg_blob_attr,
+    )
+
+
+MatchInterNodeOneToOne = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc != boxing_hob.consumer_parallel_desc)
+    & (
+        boxing_hob.producer_parallel_desc.parallel_num
+        == boxing_hob.consumer_parallel_desc.parallel_num
+    )
+    & ~boxing_hob.MatchDeviceOneToOnePerMachine
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+)
+
+
+@boxing_condition(MatchInterNodeOneToOne)
+def InterNodeOneToOne(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return builder.Build121To(
+        produced_blob_object, consumer_op_arg_parallel_attr.parallel_desc_symbol
+    )
+
+
+MatchCpuBroadcastOneToOne = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc != boxing_hob.consumer_parallel_desc)
+    & boxing_hob.MatchDeviceOneToOnePerMachine
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+)
+
+
+@boxing_condition(MatchCpuBroadcastOneToOne)
+def CpuBroadcastOneToOne(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    def get_identity_physical_in_blob_objects(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        physical_in_blob_objects,
+        boxing_parallel_desc_symbol,
+        out_parallel_num,
+    ):
+        return physical_in_blob_objects
+
+    return NaiveCpuRefPhysicalBlobObjectsScope(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        get_physical_out_blob_objects=get_identity_physical_in_blob_objects,
+    )
+
+
+MatchNoBoxing = (
+    boxing_hob.producer_parallel_desc == boxing_hob.consumer_parallel_desc
+) & (
+    (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+    | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+)
+
+
+@boxing_condition(MatchNoBoxing)
+def NoBoxing(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object
+
+
+@boxing_condition(boxing_hob.Verbose & MatchNoBoxing)
+def VerboseNoBoxing(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object
+
+
+def VerboseOptionalBoxing(boxing_method):
+    opt_boxing_method = FirstMatchedBoxing(boxing_method, VerboseNoBoxing)
+    debug_str = "VerboseOptional(%s)" % GetBoxingDebugString(boxing_method)
+    opt_boxing_method.__debug_str__ = debug_str
+    return opt_boxing_method
+
+
+MatchNcclAllReduce = (
+    boxing_hob.SingleMachine
+    & (boxing_hob.producer_parallel_desc.device_tag == "gpu")
+    & (boxing_hob.producer_parallel_desc == boxing_hob.consumer_parallel_desc)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & BlobIsPartialSum
+    & OpArgIsBroadcast
+)
+
+
+@boxing_condition(MatchNcclAllReduce)
+def GpuNcclAllReduce(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    parallel_conf = consumer_op_arg_parallel_attr.parallel_desc_symbol.parallel_conf
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    bn_in_op2blob_object["in_0"] = produced_blob_object
+    op_attribute = _GetEagerNcclAllReduce(parallel_conf, bn_in_op2blob_object)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    builder.NoBoxingStatelessCall(cfg_op_attribute, parallel_conf, bn_in_op2blob_object)
+    y_blob_object = bn_in_op2blob_object["out_0"]
+    y_blob_object.op_arg_parallel_attr.Assign(consumer_op_arg_parallel_attr)
+    return y_blob_object
+
+
+MatchSplitOneToMany = (
+    (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & boxing_hob.consumer_sbp_parallel.HasField("split_parallel")
+)
+MatchConcatManyToOne = (
+    (boxing_hob.consumer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & boxing_hob.producer_sbp_parallel.HasField("split_parallel")
+)
+MatchConcatManyToSplitMany = (
+    (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & boxing_hob.producer_sbp_parallel.HasField("split_parallel")
+    & boxing_hob.consumer_sbp_parallel.HasField("split_parallel")
+    & (
+        (boxing_hob.producer_sbp_parallel != boxing_hob.consumer_sbp_parallel)
+        | (
+            boxing_hob.producer_parallel_desc.parallel_num
+            != boxing_hob.consumer_parallel_desc.parallel_num
+        )
+    )
+)
+MatchNaiveCpuSplitToSplit = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (MatchSplitOneToMany | MatchConcatManyToOne | MatchConcatManyToSplitMany)
+)
+
+
+@boxing_condition(MatchNaiveCpuSplitToSplit)
+def NaiveCpuSplitToSplit(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return NaiveCpuRefPhysicalBlobObjectsScope(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        get_physical_out_blob_objects=NaiveBoxingToPhysicalBlobObjects,
+    )
+
+
+MatchNaiveCpuPartialSumToSplit = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & boxing_hob.producer_sbp_parallel.HasField("partial_sum_parallel")
+    & (
+        (boxing_hob.consumer_parallel_desc.parallel_num == 1)
+        | boxing_hob.consumer_sbp_parallel.HasField("split_parallel")
+    )
+)
+
+
+@boxing_condition(MatchNaiveCpuPartialSumToSplit)
+def NaiveCpuPartialSumToSplit(
+    builder, produced_blob_object, consumer_op_arg_parallel_attr
+):
+    return NaiveCpuRefPhysicalBlobObjectsScope(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        get_physical_out_blob_objects=NaiveBoxingToPhysicalBlobObjects,
+    )
+
+
+def NaiveCpuRefPhysicalBlobObjectsScope(
+    builder,
+    produced_blob_object,
+    consumer_op_arg_parallel_attr,
+    get_physical_out_blob_objects,
+):
+    physical_in_blob_objects = UnpackLogicalBoxingBlobObjectToPhysical(
+        builder, produced_blob_object
+    )
+    consumer_parallel_desc_symbol = consumer_op_arg_parallel_attr.parallel_desc_symbol
+    out_parallel_num = consumer_parallel_desc_symbol.parallel_num
+    boxing_parallel_desc_symbol = GetConcatSplitBoxingParallelDescSymbol(
+        builder,
+        consumer_parallel_desc_symbol,
+        max(len(physical_in_blob_objects), out_parallel_num),
+    )
+    physical_output_blob_objects = get_physical_out_blob_objects(
+        builder=builder,
+        produced_blob_object=produced_blob_object,
+        consumer_op_arg_parallel_attr=consumer_op_arg_parallel_attr,
+        physical_in_blob_objects=physical_in_blob_objects,
+        boxing_parallel_desc_symbol=boxing_parallel_desc_symbol,
+        out_parallel_num=out_parallel_num,
+    )
+    phy_parallel_desc_symbols = builder.GetPhysicalParallelDescSymbols(
+        consumer_op_arg_parallel_attr.parallel_desc_symbol
+    )
+    physical_output_blob_objects = RefBlobObjectWithParallelDesc(
+        builder, physical_output_blob_objects, phy_parallel_desc_symbols
+    )
+    return PackPhysicalBoxingBlobObjectsToLogical(
+        builder,
+        physical_output_blob_objects,
+        consumer_op_arg_parallel_attr,
+        produced_blob_object.op_arg_blob_attr,
+    )
+
+
+def NaiveBoxingToPhysicalBlobObjects(
+    builder,
+    produced_blob_object,
+    consumer_op_arg_parallel_attr,
+    physical_in_blob_objects,
+    boxing_parallel_desc_symbol,
+    out_parallel_num,
+):
+    op_attribute = ConstructNaiveBoxingOpConf(
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        len(physical_in_blob_objects),
+        out_parallel_num,
+    )
+    return BuildNaiveCpuBoxing(
+        builder,
+        op_attribute,
+        physical_in_blob_objects,
+        boxing_parallel_desc_symbol,
+        out_parallel_num,
+    )
+
+
+def RefBlobObjectWithParallelDesc(
+    builder, physical_blob_objects, phy_parallel_desc_symbols
+):
+    assert len(physical_blob_objects) == len(
+        phy_parallel_desc_symbols
+    ), "%s v.s. %s" % (len(physical_blob_objects), len(phy_parallel_desc_symbols))
+
+    def RefWithParallelDesc(physical_blob_object, phy_parallel_desc_symbol):
+        if physical_blob_object.parallel_desc_symbol == phy_parallel_desc_symbol:
+            return physical_blob_object
+        return builder.BroadcastBlobReference(
+            physical_blob_object, phy_parallel_desc_symbol
+        )
+
+    return [
+        RefWithParallelDesc(*pair)
+        for pair in zip(physical_blob_objects, phy_parallel_desc_symbols)
+    ]
+
+
+def PackPhysicalBoxingBlobObjectsToLogical(
+    builder, physical_blob_objects, op_arg_parallel_attr, op_arg_blob_attr
+):
+    if len(physical_blob_objects) == 1:
+        return physical_blob_objects[0]
+    return builder.PackPhysicalBlobsToLogicalBlob(
+        physical_blob_objects, op_arg_parallel_attr, op_arg_blob_attr
+    )
+
+
+def BuildNaiveCpuBoxing(
+    builder,
+    op_attribute,
+    physical_in_blob_objects,
+    boxing_parallel_desc_symbol,
+    out_parallel_num,
+):
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    for i in range(len(physical_in_blob_objects)):
+        bn_in_op2blob_object["in_%s" % i] = physical_in_blob_objects[i]
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    builder.NoBoxingStatelessCall(
+        cfg_op_attribute,
+        boxing_parallel_desc_symbol.parallel_conf,
+        bn_in_op2blob_object,
+    )
+    return [bn_in_op2blob_object["out_%s" % i] for i in range(out_parallel_num)]
+
+
+def ConstructNaiveBoxingOpConf(
+    produced_blob_object,
+    consumer_op_arg_parallel_attr,
+    in_parallel_num,
+    out_parallel_num,
+):
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.name = "undefined_boxing_op_name"
+    op_conf.device_tag = "cpu"
+    op_conf.boxing_conf.lbi.op_name = "undefined_boxing_op_name"
+    op_conf.boxing_conf.lbi.blob_name = "undefined_boxing_blob_name"
+    op_conf.boxing_conf.in_num = in_parallel_num
+    op_conf.boxing_conf.out_num = out_parallel_num
+    in_sbp_parallel = produced_blob_object.op_arg_parallel_attr.sbp_parallel
+    if in_sbp_parallel.has_split_parallel():
+        op_conf.boxing_conf.concat_box.axis = in_sbp_parallel.split_parallel().axis()
+    elif in_parallel_num == 1:
+        op_conf.boxing_conf.concat_box.axis = 0
+    else:
+        assert in_sbp_parallel.has_partial_sum_parallel()
+        op_conf.boxing_conf.add_box.SetInParent()
+    out_sbp_parallel = consumer_op_arg_parallel_attr.sbp_parallel
+    if out_sbp_parallel.has_split_parallel():
+        out_axis = out_sbp_parallel.split_parallel().axis()
+    else:
+        assert out_parallel_num == 1
+        out_axis = 0
+    op_conf.boxing_conf.split_box.axis = out_axis
+    shape = produced_blob_object.op_arg_blob_attr.shape
+    op_conf.boxing_conf.split_box.part_num.extend(
+        balanced_splitter.BalancedPartNums(shape[out_axis], out_parallel_num)
+    )
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    for i in range(in_parallel_num):
+        bn_in_op2blob_object["in_%s" % i] = produced_blob_object
+    return op_infer_util.Infer(op_conf, bn_in_op2blob_object)
+
+
+def GetConcatSplitBoxingParallelDescSymbol(
+    builder, blob_parallel_desc_symbol, max_parallel_num
+):
+    random_rank_id = random.randint(0, max_parallel_num - 1)
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag("cpu")
+    for (machine_id, _) in blob_parallel_desc_symbol.machine_id2device_id_list.items():
+        parallel_conf.add_device_name("@%s:%s" % (machine_id, random_rank_id))
+    return builder.GetParallelDescSymbol(parallel_conf)
+
+
+def UnpackLogicalBoxingBlobObjectToPhysical(builder, produced_blob_object):
+    if produced_blob_object.parallel_desc_symbol.parallel_num == 1:
+        return [produced_blob_object]
+    return builder.UnpackLogicalBlobToPhysicalBlobs(produced_blob_object)
+
+
+MatchCpuBroadcastOneToMany = (
+    boxing_hob.SingleMachine
+    & (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & boxing_hob.ProducerDevicesContainedInConsumerDevices
+    & (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & boxing_hob.consumer_sbp_parallel.HasField("broadcast_parallel")
+)
+
+
+@boxing_condition(MatchCpuBroadcastOneToMany)
+def CpuBroadcastOneToMany(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return CpuOneToManyBroadcastBlobReference(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr.parallel_desc_symbol,
+    )
+
+
+MatchBroadcastManyToOne = (
+    (
+        boxing_hob.producer_parallel_desc.device_tag
+        == boxing_hob.consumer_parallel_desc.device_tag
+    )
+    & boxing_hob.ConsumerDevicesContainedInProducerDevices
+    & (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num == 1)
+    & boxing_hob.producer_sbp_parallel.HasField("broadcast_parallel")
+)
+
+
+@boxing_condition(MatchBroadcastManyToOne)
+def BroadcastManyToOne(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    y_blob_objects = builder.UnpackLogicalBlobToPhysicalBlobs(produced_blob_object)
+    for y in y_blob_objects:
+        if y.parallel_desc_symbol == consumer_op_arg_parallel_attr.parallel_desc_symbol:
+            return y
+    raise NotImplementedError("op_arg's devices is not contained in blob's devices")
+
+
+def Assign(builder, ref_blob_object, value_blob_object):
+    return BuildAssignInstruction(
+        builder, ref_blob_object, value_blob_object, _AssignOpConf()
+    )
+
+
+def CpuOneToManyBroadcastBlobReference(
+    builder, produced_blob_object, to_parallel_desc_symbol
+):
+    x_parallel_desc_symbol = produced_blob_object.parallel_desc_symbol
+    x_machine_ids = list(dict(x_parallel_desc_symbol.machine_id2device_id_list).keys())
+    to_machine_ids = list(
+        dict(to_parallel_desc_symbol.machine_id2device_id_list).keys()
+    )
+    assert x_machine_ids == to_machine_ids, (x_machine_ids, to_machine_ids)
+    x_first_device_ids = x_parallel_desc_symbol.machine_id2device_id_list[
+        x_machine_ids[0]
+    ]
+    assert len(x_first_device_ids) == 1, x_first_device_ids
+    if x_parallel_desc_symbol == to_parallel_desc_symbol:
+        return produced_blob_object
+    return builder.BroadcastBlobReference(produced_blob_object, to_parallel_desc_symbol)
+
+
+def BuildCopyHdInstruction(builder, produced_blob_object, to_device_tag):
+    (op_conf, lbi) = _MakeCopyHdOpConfAndRetLbi()
+    return _BuildCopyInstruction(builder, produced_blob_object, op_conf, to_device_tag)
+
+
+def _MakeCopyHdOpConfAndRetLbi():
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.name = "copy_hd"
+    op_conf.device_tag = "gpu"
+    setattr(op_conf.copy_conf, "in", "%s/in" % op_conf.name)
+    op_conf.copy_conf.out = "out"
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return (op_conf, lbi)
+
+
+@contextmanager
+def _CudaHostPinBlob(build, blob_object):
+    build.CudaHostRegisterBlob(blob_object)
+    try:
+        yield
+    finally:
+        build.CudaHostUnregisterBlob(blob_object)
+
+
+def _BuildCopyInstruction(builder, produced_blob_object, op_conf, to_device_tag):
+    x_devices = produced_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    x_device_tag = produced_blob_object.parallel_desc_symbol.device_tag
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    bn_in_op2blob_object["in"] = produced_blob_object
+    op_attribute = op_infer_util.Infer(op_conf, bn_in_op2blob_object)
+    assert to_device_tag != x_device_tag, (to_device_tag, x_device_tag)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    if to_device_tag == "cpu" and x_device_tag == "gpu":
+        x_parallel_conf = produced_blob_object.parallel_desc_symbol.parallel_conf
+        builder.NoBoxingCudaD2HStatelessCall(
+            cfg_op_attribute, x_parallel_conf, bn_in_op2blob_object, TryReplaceDeviceTag
+        )
+    elif to_device_tag == "gpu" and x_device_tag == "cpu":
+        out_parallel_desc_symbol = TryReplaceDeviceTag(
+            builder, produced_blob_object.parallel_desc_symbol, to_device_tag
+        )
+        out_parallel_conf = out_parallel_desc_symbol.parallel_conf
+        with _CudaHostPinBlob(builder, produced_blob_object):
+            builder.NoBoxingCudaH2DStatelessCall(
+                cfg_op_attribute, out_parallel_conf, bn_in_op2blob_object
+            )
+    else:
+        raise NotImplementedError(
+            "invalid device found. to_device_tag: %s, x_device_tag: %s"
+            % (to_device_tag, x_device_tag)
+        )
+    sbp_parallel = bn_in_op2blob_object["out"].op_arg_parallel_attr.sbp_parallel
+    sbp_parallel.CopyFrom(produced_blob_object.op_arg_parallel_attr.sbp_parallel)
+    return bn_in_op2blob_object["out"]
+
+
+def _AssignOpConf():
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.name = "assign"
+    op_conf.assign_conf.ref = "assign/ref"
+    op_conf.assign_conf.value = "assign/value"
+    device_tag = flow.current_scope().device_parallel_desc_symbol.device_tag
+    op_conf.device_tag = device_tag
+    return op_conf
+
+
+def BuildAssignInstruction(builder, ref_blob_object, value_blob_object, op_conf):
+    ref_parallel_conf = ref_blob_object.parallel_desc_symbol.parallel_conf
+    ref_devices = ref_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    value_devices = value_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    assert ref_devices == value_devices, "\nref_devices: %s\nvalue_devices: %s" % (
+        ref_devices,
+        value_devices,
+    )
+    ref_device_tag = ref_blob_object.parallel_desc_symbol.device_tag
+    value_device_tag = value_blob_object.parallel_desc_symbol.device_tag
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    bn_in_op2blob_object["ref"] = ref_blob_object
+    bn_in_op2blob_object["value"] = value_blob_object
+    op_attribute = op_infer_util.Infer(op_conf, bn_in_op2blob_object)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    if ref_device_tag == value_device_tag:
+        builder.NoBoxingStatelessCall(
+            cfg_op_attribute, ref_parallel_conf, bn_in_op2blob_object
+        )
+    elif ref_device_tag == "cpu" and value_device_tag == "gpu":
+        value_parallel_conf = value_blob_object.parallel_desc_symbol.parallel_conf
+        builder.NoBoxingCudaD2HStatelessCall(
+            cfg_op_attribute,
+            value_parallel_conf,
+            bn_in_op2blob_object,
+            TryReplaceDeviceTag,
+        )
+    elif ref_device_tag == "gpu" and value_device_tag == "cpu":
+        with _CudaHostPinBlob(builder, value_blob_object):
+            builder.NoBoxingCudaH2DStatelessCall(
+                cfg_op_attribute, ref_parallel_conf, bn_in_op2blob_object
+            )
+    else:
+        raise NotImplementedError(
+            "invalid device found. ref_device_tag: %s, value_device_tag: %s"
+            % (ref_device_tag, value_device_tag)
+        )
+
+
+def TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag):
+    return boxing_middle.TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag)
+
+
+def ReplaceDeviceTag(parallel_desc_symbol, device_tag, builder=None):
+    return boxing_middle.ReplaceDeviceTag(
+        parallel_desc_symbol, device_tag, builder=builder
+    )
+
+
+def _GetEagerNcclAllReduce(parallel_conf, ibn2blob_object):
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.device_tag = "gpu"
+    op_conf.name = "eager_nccl_all_reduce"
+    op_conf.user_conf.op_type_name = "eager_nccl_all_reduce"
+    op_conf.user_conf.input["in"].s.append("eager_nccl_all_reduce/in_0")
+    op_conf.user_conf.output["out"].s.append("eager_nccl_all_reduce/out_0")
+    op_conf.user_conf.attr["parallel_conf"].at_string = str(parallel_conf)
+    return op_infer_util.Infer(op_conf, ibn2blob_object)
+
+
+NcclAllReduce = Sequential(
+    boxing_middle.BoxingToMiddle(
+        GpuNcclAllReduce,
+        boxing_middle.ProducerParallelDesc,
+        boxing_middle.BroadcastParallel,
+    ),
+    OptionalBoxing(CopyD2H),
+)
+BoxingIntraNodeOneToOne = Sequential(
+    boxing_middle.BoxingToMiddle(
+        OptionalBoxing(CopyD2H),
+        boxing_middle.ReplaceProducerDeviceTag("cpu"),
+        boxing_middle.ProducerSbpParallel,
+    ),
+    boxing_middle.BoxingToMiddle(
+        CpuBroadcastOneToOne,
+        boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+        boxing_middle.ConsumerSbpParallel,
+    ),
+    OptionalBoxing(CopyH2D),
+)
+BoxingInterNodeOneToOne = Sequential(
+    boxing_middle.BoxingToMiddle(
+        OptionalBoxing(CopyD2H),
+        boxing_middle.ReplaceProducerDeviceTag("cpu"),
+        boxing_middle.ProducerSbpParallel,
+    ),
+    boxing_middle.BoxingToMiddle(
+        InterNodeOneToOne,
+        boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+        boxing_middle.ConsumerSbpParallel,
+    ),
+    OptionalBoxing(CopyH2D),
+)
+BoxingInterNodeOneToMany = Sequential(
+    boxing_middle.BoxingToMiddle(
+        OptionalBoxing(CopyD2H),
+        boxing_middle.ReplaceProducerDeviceTag("cpu"),
+        boxing_middle.ProducerSbpParallel,
+    ),
+    boxing_middle.BoxingToMiddle(
+        InterNodeOneToMany,
+        boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+        boxing_middle.ConsumerSbpParallel,
+    ),
+    OptionalBoxing(CopyH2D),
+)
+conditional_function_table = [
+    CopyH2D,
+    CopyD2H,
+    NoBoxing,
+    BoxingIntraNodeOneToOne,
+    BoxingInterNodeOneToOne,
+    BoxingInterNodeOneToMany,
+    BroadcastManyToOne,
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(BroadcastManyToOne),
+            boxing_middle.ProducerRandomParallelIdPerMachine(),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CpuBroadcastOneToOne),
+            boxing_middle.ConsumerRandomParallelIdPerMachine("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CpuBroadcastOneToMany),
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+        exclude=(
+            BroadcastManyToOne,
+            CopyH2D,
+            CopyD2H,
+            NoBoxing,
+            BoxingIntraNodeOneToOne,
+        ),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            BroadcastManyToOne,
+            boxing_middle.ProducerRandomParallelIdPerMachine(),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuSplitToSplit,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.ConsumerSbpParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+    ),
+    NcclAllReduce,
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuPartialSumToSplit,
+            boxing_middle.ConsumerRandomParallelIdPerMachine("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            CpuBroadcastOneToMany,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+        exclude=(NcclAllReduce,),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuPartialSumToSplit,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.ConsumerSbpParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuSplitToSplit,
+            boxing_middle.ConsumerRandomParallelIdPerMachine("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            CpuBroadcastOneToMany,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+        exclude=(NcclAllReduce,),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuSplitToSplit,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.ConsumerSbpParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+    ),
+]
+
+
+class BoxingUtil(oneflow._oneflow_internal.deprecated.ForeignBoxingUtil):
+    def __init__(self):
+        oneflow._oneflow_internal.deprecated.ForeignBoxingUtil.__init__(self)
+
+    def BoxingTo(self, builder, blob_object, op_arg_parallel_attr):
+        return BoxingTo(builder, blob_object, op_arg_parallel_attr)
+
+    def TryReplaceDeviceTag(self, builder, parallel_desc_symbol, device_tag):
+        return TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag)
+
+    def Assign(self, builder, target_blob_object, source_blob_object):
+        return Assign(builder, target_blob_object, source_blob_object)
+
+
+_global_boxing_util = BoxingUtil()
diff --git a/python/oneflow/compatible/single_client/eager/eager_blob_util.py b/python/oneflow/compatible/single_client/eager/eager_blob_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..96e7efb138ecb9855d81a87bcf8d8fac28ffd351
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/eager_blob_util.py
@@ -0,0 +1,115 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.framework import blob_trait as blob_trait
+from oneflow.compatible.single_client.framework import (
+    python_callback as python_callback,
+)
+from oneflow.compatible.single_client.framework.dtype import (
+    convert_proto_dtype_to_oneflow_dtype,
+)
+from oneflow.compatible.single_client.support import async_util as async_util
+
+
+@property
+def dtype(self):
+    return convert_proto_dtype_to_oneflow_dtype(self.get_dtype())
+
+
+def numpy(self):
+    return _GetPhysicalBlobBodyCache(self.blob_object)
+
+
+def numpy_list(self):
+    return _GetPhysicalBlobBodyCache(self.blob_object)
+
+
+def RegisterMethod4EagerPhysicalBlob():
+    oneflow._oneflow_internal.EagerPhysicalBlob.dtype = dtype
+    oneflow._oneflow_internal.EagerPhysicalBlob.numpy = numpy
+    oneflow._oneflow_internal.EagerPhysicalBlob.numpy_list = numpy_list
+
+
+def FetchTensorBlobAsNumpyList(parallel_size, blob_object):
+    def AsyncFetchBlobBody(Yield):
+        fetcher = _MakeFetcherEagerBlobBodyAsNumpyFromOfBlob(Yield)
+
+        def BuildFetchBlobBodyInstruction(builder):
+            builder.FetchBlobBody(
+                blob_object, python_callback.GetIdForRegisteredCallback(fetcher)
+            )
+            builder.InsertRemoveForeignCallbackInstruction(
+                blob_object.object_id,
+                python_callback.GetIdForRegisteredCallback(fetcher),
+            )
+
+        oneflow._oneflow_internal.deprecated.PhysicalRun(BuildFetchBlobBodyInstruction)
+
+    return async_util.Await(parallel_size, AsyncFetchBlobBody)
+
+
+def _GetPhysicalBlobHeaderCache(blob_object):
+    return _FetchBlobHeader(blob_object)
+
+
+def _GetPhysicalBlobBodyCache(blob_object):
+    return _FetchPhysicalBlobBody(blob_object)
+
+
+def _FetchBlobHeader(blob_object):
+    def AsyncFetchBlobHeader(Yield):
+        fetcher = _MakeFetcherEagerPhysicalBlobHeaderFromOfBlob(Yield)
+
+        def BuildFetchBlobHeaderInstruction(builder):
+            builder.FetchBlobHeader(
+                blob_object, python_callback.GetIdForRegisteredCallback(fetcher)
+            )
+            builder.InsertRemoveForeignCallbackInstruction(
+                blob_object.object_id,
+                python_callback.GetIdForRegisteredCallback(fetcher),
+            )
+
+        oneflow._oneflow_internal.deprecated.PhysicalRun(
+            BuildFetchBlobHeaderInstruction
+        )
+
+    return async_util.Await(1, AsyncFetchBlobHeader)[0]
+
+
+def _FetchPhysicalBlobBody(blob_object):
+    return FetchTensorBlobAsNumpyList(1, blob_object)[0]
+
+
+def _MakeFetcherEagerPhysicalBlobHeaderFromOfBlob(Yield):
+    def Callback(ofblob):
+        Yield(
+            oneflow._oneflow_internal.EagerPhysicalBlobHeader(
+                ofblob.static_shape,
+                ofblob.shape,
+                oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+                    ofblob.dtype
+                ),
+            )
+        )
+
+    return Callback
+
+
+def _MakeFetcherEagerBlobBodyAsNumpyFromOfBlob(Yield):
+    def FetchFromOfBlob(ofblob):
+        Yield(ofblob.CopyToNdarray())
+
+    return FetchFromOfBlob
diff --git a/python/oneflow/compatible/single_client/eager/gradient_util.py b/python/oneflow/compatible/single_client/eager/gradient_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc479fd7c0419c553e4e8647d20b9fad26fb23a3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/gradient_util.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+
+
+def GetDefaultBackwardBlobRegister():
+    return session_ctx.GetDefaultSession().backward_blob_register
+
+
+def ReleaseUnusedBlobObject(op_attribute, blob_register):
+    assert op_attribute.HasField("blob_last_used_signature"), op_attribute
+    signature_map = op_attribute.blob_last_used_signature.bn_in_op2blob_last_used
+    bn_in_op2lbi = op_attribute.arg_signature.bn_in_op2lbi
+    for (bn_in_op, is_blob_last_used) in signature_map.items():
+        if not is_blob_last_used:
+            continue
+        lbi = bn_in_op2lbi[bn_in_op]
+        lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+        blob_register.ClearObject4BlobName(lbn)
+
+
+def TrySetBackwardUsedBlobObject(op_attribute, fw_blob_register, bw_blob_register):
+    assert op_attribute.HasField("blob_backward_used_signature"), op_attribute
+    signature_map = (
+        op_attribute.blob_backward_used_signature.bn_in_op2blob_backward_used
+    )
+    bn_in_op2lbi = op_attribute.arg_signature.bn_in_op2lbi
+    for (bn_in_op, is_blob_backward_used) in signature_map.items():
+        if not is_blob_backward_used:
+            continue
+        lbi = bn_in_op2lbi[bn_in_op]
+        lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+        blob_object = fw_blob_register.GetObject4BlobName(lbn)
+        bw_blob_register.TrySetObject4BlobName(lbn, blob_object)
diff --git a/python/oneflow/compatible/single_client/eager/interpreter_callback.py b/python/oneflow/compatible/single_client/eager/interpreter_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..6782d17420ae4300cfe05e93d3f20ff06f63d9a8
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/interpreter_callback.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.eager import gradient_util as gradient_util
+from oneflow.compatible.single_client.eager import op_executor as op_executor
+from oneflow.compatible.single_client.eager import symbol_storage as symbol_storage
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.core.job import placement_pb2 as placement_pb
+from oneflow.core.job import scope_pb2 as scope_pb
+from oneflow.core.operator import op_attribute_pb2 as op_attribute_pb
+
+
+def MakeScopeSymbol(job_conf, parallel_conf, is_mirrored):
+    parallel_hierarchy = None
+    if parallel_conf.has_hierarchy():
+        parallel_hierarchy = oneflow._oneflow_internal.Size(
+            tuple(parallel_conf.hierarchy().dim())
+        )
+    return scope_util.MakeInitialScope(
+        job_conf,
+        parallel_conf.device_tag(),
+        list(parallel_conf.device_name()),
+        parallel_hierarchy,
+        is_mirrored,
+    ).symbol_id
+
+
+def MakeParallelDescSymbol(parallel_conf):
+    symbol_id = None
+
+    def BuildInstruction(builder):
+        nonlocal symbol_id
+        symbol_id = builder.GetParallelDescSymbol(parallel_conf).symbol_id
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+    return symbol_id
+
+
+def MirroredCast(op_attribute_str, parallel_conf):
+    op_attribute = text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+    blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+    is_cast_to_mirrored = op_attribute.op_conf.HasField("cast_to_mirrored_conf")
+    is_cast_from_mirrored = op_attribute.op_conf.HasField("cast_from_mirrored_conf")
+    assert is_cast_to_mirrored or is_cast_from_mirrored
+    _MirroredCastAndAddOutputBlobReleaser(op_attribute, blob_register)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    gradient_util.TrySetBackwardUsedBlobObject(
+        op_attribute, blob_register, bw_blob_register
+    )
+
+
+def InterpretCompletedOp(op_attribute_str, parallel_conf):
+    op_attribute = text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+    blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    _InterpretCompletedOp(op_attribute, parallel_conf, blob_register)
+    gradient_util.ReleaseUnusedBlobObject(op_attribute, blob_register)
+
+
+def _InterpretCompletedOp(op_attribute, parallel_conf, blob_register):
+    return op_executor.Interpret(op_attribute, parallel_conf, blob_register)
+
+
+def _MirroredCastAndAddOutputBlobReleaser(op_attribute, blob_register):
+    op_executor.MirroredCast(op_attribute, blob_register)
+    _AddOutputBlobObjectReleaser4InputBlobObject(op_attribute, blob_register)
+
+
+def _AddOutputBlobObjectReleaser4InputBlobObject(op_attribute, blob_register):
+    in_lbi = op_attribute.arg_signature.bn_in_op2lbi["in"]
+    in_lbn = "%s/%s" % (in_lbi.op_name, in_lbi.blob_name)
+    in_blob_object = blob_register.GetObject4BlobName(in_lbn)
+    release = _MakeReleaser4MirroredCastBlobObject(op_attribute, blob_register)
+    in_blob_object.add_releaser(release)
+
+
+def _MakeReleaser4MirroredCastBlobObject(op_attribute, blob_register):
+    def ReleaseMirroredBlobObject(obj):
+        for obn in op_attribute.output_bns:
+            lbi = op_attribute.arg_signature.bn_in_op2lbi[obn]
+            lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+            blob_object = blob_register.GetObject4BlobName(lbn)
+            blob_register.ClearObject4BlobName(lbn)
+
+    return ReleaseMirroredBlobObject
diff --git a/python/oneflow/compatible/single_client/eager/op_executor.py b/python/oneflow/compatible/single_client/eager/op_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd6f95860d2847287f2f8e6bf31699dc42737149
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/op_executor.py
@@ -0,0 +1,479 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import numpy as np
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import blob_register as blob_register_util
+from oneflow.compatible.single_client.eager import boxing_util as boxing_util
+from oneflow.compatible.single_client.eager import op_infer_util as op_infer_util
+from oneflow.compatible.single_client.eager import symbol_storage as symbol_storage
+from oneflow.compatible.single_client.experimental import namescope as name_scope
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import (
+    python_callback as python_callback,
+)
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.core.operator import interface_blob_conf_pb2 as inter_face_blob_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.operator import op_node_signature_pb2 as op_node_signature_pb
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+default_blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def Interpret(op_attribute, parallel_conf, blob_register):
+    if op_attribute.op_conf.HasField("cast_to_mirrored_conf"):
+        return MirroredCast(op_attribute, blob_register)
+    if op_attribute.op_conf.HasField("cast_from_mirrored_conf"):
+        return MirroredCast(op_attribute, blob_register)
+    assert isinstance(parallel_conf, placement_cfg.ParallelConf)
+    if op_attribute.op_conf.HasField("distribute_split_conf"):
+        return DistributeSplitOrClone(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("distribute_clone_conf"):
+        return DistributeSplitOrClone(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("distribute_concat_conf"):
+        return DistributeConcatOrAdd(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("distribute_add_conf"):
+        return DistributeConcatOrAdd(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("variable_conf"):
+        return _FindOrCreateVarBlobObject(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("foreign_watch_conf"):
+        return _Watch(op_attribute, parallel_conf, blob_register)
+    return _NaiveInterpret(op_attribute, parallel_conf, blob_register)
+
+
+def OpKernelCall(opkernel_object, op_attribute, blob_register):
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatefulCall(
+                cfg_op_attribute,
+                opkernel_object,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def MirroredCast(op_attribute, blob_register):
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            in_blob_object = bn_in_op2blob_object["in"]
+            parallel_desc_symbol = in_blob_object.parallel_desc_symbol
+            op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+                parallel_desc_symbol, str(op_attribute), "out"
+            )
+            out_blob_object = builder.MakeReferenceBlobObject(
+                in_blob_object, op_arg_parallel_attr
+            )
+            bn_in_op2blob_object["out"] = out_blob_object
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def DistributeSplitOrClone(op_attribute, parallel_conf, blob_register):
+    parallel_sig = op_attribute.parallel_signature.bn_in_op2parallel_desc_symbol_id
+
+    def GetInBlobObject(builder, ibn, bn_in_op2blob_object):
+        origin_blob_object = bn_in_op2blob_object[ibn]
+        in_op_parallel_desc_sym = oneflow._oneflow_internal.GetPlacementSymbol(
+            parallel_sig[ibn]
+        )
+        in_op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+            in_op_parallel_desc_sym, str(op_attribute), ibn
+        )
+        return boxing_util.BoxingTo(
+            builder, origin_blob_object, in_op_arg_parallel_attr
+        )
+
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            physical_out_blob_objects = builder.UnpackLogicalBlobToPhysicalBlobs(
+                GetInBlobObject(builder, "in", bn_in_op2blob_object)
+            )
+            for (i, blob_object) in enumerate(physical_out_blob_objects):
+                bn_in_op2blob_object["out_%s" % i] = blob_object
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def DistributeConcatOrAdd(op_attribute, parallel_conf, blob_register):
+    op_parallel_desc_sym = oneflow._oneflow_internal.GetPlacementSymbol(
+        op_attribute.parallel_signature.op_parallel_desc_symbol_id
+    )
+    parallel_size = len(op_attribute.input_bns)
+    op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+        op_parallel_desc_sym, str(op_attribute), "out"
+    )
+    op_arg_blob_attr = oneflow._oneflow_internal.GetOpArgBlobAttribute(
+        str(op_attribute), "out"
+    )
+    parallel_sig = op_attribute.parallel_signature.bn_in_op2parallel_desc_symbol_id
+
+    def GetInBlobObject(builder, i, bn_in_op2blob_object):
+        ibn = "in_%s" % i
+        origin_blob_object = bn_in_op2blob_object[ibn]
+        in_op_parallel_desc_sym = oneflow._oneflow_internal.GetPlacementSymbol(
+            parallel_sig[ibn]
+        )
+        in_op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+            in_op_parallel_desc_sym, str(op_attribute), ibn
+        )
+        return boxing_util.BoxingTo(
+            builder, origin_blob_object, in_op_arg_parallel_attr
+        )
+
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+
+            def GetPhysicalInBlob(i):
+                return GetInBlobObject(builder, i, bn_in_op2blob_object)
+
+            in_blob_objects = [GetPhysicalInBlob(i) for i in range(parallel_size)]
+            bn_in_op2blob_object["out"] = builder.PackPhysicalBlobsToLogicalBlob(
+                in_blob_objects, op_arg_parallel_attr, op_arg_blob_attr
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def _FindOrCreateVarBlobObject(op_attribute, parallel_conf, blob_register):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    name = name_scope.GetJobNameScopePrefix(job_name) + op_attribute.op_conf.name
+    sess = session_ctx.GetDefaultSession()
+    (var_blob, _) = sess.TryGetVariableBlobOfJobFromStash(job_name, name)
+    if var_blob is not None:
+        blob_register.SetObject4BlobName(
+            var_blob.logical_blob_name, var_blob.blob_object
+        )
+        return
+    _NaiveInterpret(op_attribute, parallel_conf, blob_register)
+    var_blob = _MakeEagerLogicalBlob(op_attribute, "out", blob_register=blob_register)
+    EagerInitVariableBlob(sess, op_attribute.op_conf, var_blob)
+    sess.StashVariableBlob4Job(job_name, op_attribute.op_conf.name, var_blob)
+    return var_blob
+
+
+def _Watch(op_attribute, parallel_conf, blob_register):
+    lbi = op_attribute.arg_signature.bn_in_op2lbi["in"]
+    uuid = op_attribute.op_conf.foreign_watch_conf.handler_uuid
+    lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+    in_blob_object = blob_register.GetObject4BlobName(lbn)
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    if in_blob_object.op_arg_parallel_attr.is_mirrored():
+        blob = oneflow._oneflow_internal.EagerMirroredBlob(
+            lbi, in_blob_object, default_blob_register
+        )
+    else:
+        blob = oneflow._oneflow_internal.EagerConsistentBlob(
+            lbi, in_blob_object, default_blob_register
+        )
+    uuid2watch_handler = session_ctx.GetDefaultSession().uuid2watch_handler
+    assert uuid in uuid2watch_handler
+    uuid2watch_handler[uuid](blob)
+    del uuid2watch_handler[uuid]
+
+
+def _NaiveInterpret(op_attribute, parallel_conf, blob_register):
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatelessCall(
+                cfg_op_attribute,
+                parallel_conf,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def _MakeEagerLogicalBlob(op_attribute, obn, blob_register):
+    lbi = op_attribute.arg_signature.bn_in_op2lbi[obn]
+    blob_object = blob_register.GetObject4BlobName(
+        "%s/%s" % (lbi.op_name, lbi.blob_name)
+    )
+    mirrored_sig_map = op_attribute.mirrored_signature.bn_in_op2opt_mirrored_parallel
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    if mirrored_sig_map[obn].HasField("mirrored_parallel"):
+        return oneflow._oneflow_internal.EagerMirroredBlob(
+            lbi, blob_object, default_blob_register
+        )
+    else:
+        return oneflow._oneflow_internal.EagerConsistentBlob(
+            lbi, blob_object, default_blob_register
+        )
+
+
+def EagerInitVariableBlob(sess, var_op_conf, var_blob):
+    snapshot_path = sess.snapshot_mgr.get_snapshot_path(var_op_conf.name)
+    with flow.scope.placement("cpu", "0:0"):
+        if snapshot_path is None:
+            blob_object = _EagerRunModelInit(var_op_conf)
+        else:
+            blob_object = _EagerRunModelLoad(var_op_conf, snapshot_path)
+        _Assign(var_blob.blob_object, blob_object)
+
+
+def EagerSaveVariableBlob(snapshot_path):
+    var_blobs = session_ctx.GetDefaultSession().var_name2var_blob.values()
+    with flow.scope.placement("cpu", "0:0"):
+        _EagerRunModelSave(var_blobs, snapshot_path)
+
+
+def _Assign(var_blob_object, value_blob_object):
+    def BuildAssignInstruction(builder):
+        new_parallel_desc_symbol = boxing_util.TryReplaceDeviceTag(
+            builder, var_blob_object.parallel_desc_symbol, "cpu"
+        )
+        consumer_op_arg_parallel_attr = oneflow._oneflow_internal.OpArgParallelAttribute(
+            new_parallel_desc_symbol,
+            str(var_blob_object.op_arg_parallel_attr.sbp_parallel),
+            str(var_blob_object.op_arg_parallel_attr.opt_mirrored_parallel),
+        )
+        tmp_blob_object = boxing_util.BoxingTo(
+            builder, value_blob_object, consumer_op_arg_parallel_attr
+        )
+        boxing_util.Assign(builder, var_blob_object, tmp_blob_object)
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildAssignInstruction)
+
+
+def _BuildNotMirroredScope(old_scope, builder):
+    return builder.BuildScopeWithNewIsMirrored(old_scope, False)
+
+
+def _EagerRunModelInit(var_op_conf):
+    (op_conf, _) = _GenModelInitOpConfAndRetLbi(var_op_conf)
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildModelInitInstruction(builder):
+        upstream_signature = op_node_signature_pb.OpNodeSignature()
+        op_conf.scope_symbol_id = flow.current_scope().symbol_id
+        op_attribute = c_api_util.InferOpConf(op_conf, upstream_signature)
+        parallel_conf = flow.current_scope().device_parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    sess = session_ctx.GetDefaultSession()
+    with scope_util.ScopeContext(scope_util.MakeScope(_BuildNotMirroredScope)):
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildModelInitInstruction)
+    return bn_in_op2blob_object["out_0"]
+
+
+def _MakeModelIOPathInputBuilds(op_conf, path, bn_in_op2blob_object):
+    def BuildModelIOPathInputInstruction(builder):
+        op_attribute = op_infer_util.Infer(op_conf, ibn2blob_object={})
+        parallel_conf = flow.current_scope().device_parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    def FeedPath(ofblob):
+        ofblob.CopyFromNdarray(np.frombuffer(path.encode("ascii"), dtype=np.int8))
+
+    def BuildFeedPathInstruction(builder):
+        blob_object = bn_in_op2blob_object["out"]
+        builder.FeedBlob(
+            blob_object, python_callback.GetIdForRegisteredCallback(FeedPath)
+        )
+        builder.InsertRemoveForeignCallbackInstruction(
+            blob_object.object_id, python_callback.GetIdForRegisteredCallback(FeedPath)
+        )
+
+    return (BuildModelIOPathInputInstruction, BuildFeedPathInstruction)
+
+
+def _EagerRunModelLoad(var_op_conf, snapshot_path):
+    assert isinstance(snapshot_path, str)
+    assert os.path.basename(snapshot_path) == "out"
+    snapshot_path = os.path.dirname(snapshot_path)
+    assert os.path.basename(snapshot_path) == var_op_conf.name
+    snapshot_path = os.path.dirname(snapshot_path)
+    (path_input_op_conf, path_lbi) = _GenModelIOPathInputOpConfAndRetLbi()
+    path_input_blob_objects = {}
+    (
+        BuildModelIOPathInputInstruction,
+        BuildFeedPathInstruction,
+    ) = _MakeModelIOPathInputBuilds(
+        path_input_op_conf, snapshot_path, path_input_blob_objects
+    )
+    (model_load_op_conf, _) = _GenModelLoadOpConfAndRetLbi(var_op_conf, path_lbi)
+    model_load_blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildModelLoadInstruction(builder):
+        path_blob_object = path_input_blob_objects["out"]
+        model_load_blob_objects["path"] = path_blob_object
+        op_attribute = op_infer_util.Infer(
+            model_load_op_conf, ibn2blob_object=model_load_blob_objects
+        )
+        parallel_conf = path_blob_object.parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute,
+            parallel_conf,
+            model_load_blob_objects,
+            boxing_util.BoxingTo,
+        )
+
+    sess = session_ctx.GetDefaultSession()
+    with scope_util.ScopeContext(scope_util.MakeScope(_BuildNotMirroredScope)):
+        oneflow._oneflow_internal.deprecated.LogicalRun(
+            BuildModelIOPathInputInstruction
+        )
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildFeedPathInstruction)
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildModelLoadInstruction)
+    return model_load_blob_objects["out_0"]
+
+
+def _EagerRunModelSave(var_blobs, snapshot_path):
+    (path_input_op_conf, path_lbi) = _GenModelIOPathInputOpConfAndRetLbi()
+    path_input_blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    (
+        BuildModelIOPathInputInstruction,
+        BuildFeedPathInstruction,
+    ) = _MakeModelIOPathInputBuilds(
+        path_input_op_conf, snapshot_path, path_input_blob_objects
+    )
+    model_save_op_conf = _GenModelSaveOpConf(var_blobs, path_lbi)
+    model_save_blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildModelSaveInstruction(builder):
+        path_blob_object = path_input_blob_objects["out"]
+        model_save_blob_objects["path"] = path_blob_object
+        for (i, blob) in enumerate(var_blobs):
+            model_save_blob_objects["in_{}".format(i)] = blob.blob_object
+        op_attribute = op_infer_util.Infer(
+            model_save_op_conf, ibn2blob_object=model_save_blob_objects
+        )
+        parallel_conf = path_blob_object.parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute,
+            parallel_conf,
+            model_save_blob_objects,
+            boxing_util.BoxingTo,
+        )
+
+    sess = session_ctx.GetDefaultSession()
+    with scope_util.ScopeContext(scope_util.MakeScope(_BuildNotMirroredScope)):
+        oneflow._oneflow_internal.deprecated.LogicalRun(
+            BuildModelIOPathInputInstruction
+        )
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildFeedPathInstruction)
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildModelSaveInstruction)
+
+
+def _GenModelInitOpConfAndRetLbi(var_op_conf):
+    variable_op_conf = op_conf_util.VariableOpConf()
+    variable_op_conf.CopyFrom(var_op_conf.variable_conf)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_init"
+    op_conf.device_tag = "cpu"
+    op_conf.model_init_conf.out.append("out_0")
+    op_conf.model_init_conf.variable_op_name.append(var_op_conf.name)
+    op_conf.model_init_conf.original_variable_conf.append(variable_op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.model_init_conf.out[0]
+    return (op_conf, lbi)
+
+
+def _GenModelLoadOpConfAndRetLbi(var_op_conf, path_lbi):
+    variable_op_conf = op_conf_util.VariableOpConf()
+    variable_op_conf.CopyFrom(var_op_conf.variable_conf)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_load"
+    op_conf.device_tag = "cpu"
+    op_conf.model_load_conf.path = "{}/{}".format(path_lbi.op_name, path_lbi.blob_name)
+    op_conf.model_load_conf.out.append("out_0")
+    op_conf.model_load_conf.variable_op_name.append(var_op_conf.name)
+    op_conf.model_load_conf.original_variable_conf.append(variable_op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.model_load_conf.out[0]
+    return (op_conf, lbi)
+
+
+def _GenModelIOPathInputOpConfAndRetLbi():
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_io_path_input"
+    op_conf.device_tag = "cpu"
+    op_conf.input_conf.out = "out"
+    blob_conf = inter_face_blob_conf_util.InterfaceBlobConf()
+    blob_conf.shape.dim.append(65536)
+    blob_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+        flow.int8
+    )
+    blob_conf.is_dynamic = True
+    op_conf.input_conf.blob_conf.CopyFrom(blob_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.input_conf.out
+    return (op_conf, lbi)
+
+
+def _GenModelSaveOpConf(var_blobs, path_lbi):
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_save"
+    op_conf.device_tag = "cpu"
+    op_conf.model_save_conf.path = "{}/{}".format(path_lbi.op_name, path_lbi.blob_name)
+    for blob in var_blobs:
+        getattr(op_conf.model_save_conf, "in").append(blob.logical_blob_name)
+        getattr(op_conf.model_save_conf, "key").append(blob.logical_blob_name)
+    return op_conf
diff --git a/python/oneflow/compatible/single_client/eager/op_infer_util.py b/python/oneflow/compatible/single_client/eager/op_infer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..acf99ca3c2bcf55b77903114ff7cfaaf1269c6cd
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/op_infer_util.py
@@ -0,0 +1,43 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from google.protobuf import text_format
+
+from oneflow._oneflow_internal.oneflow.core.operator import (
+    op_node_signature as op_node_signature_cfg,
+)
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.core.operator import op_node_signature_pb2 as op_node_signature_pb
+
+
+def Infer(op_conf, ibn2blob_object, scope_symbol_id=None):
+    if scope_symbol_id is None:
+        scope_symbol_id = flow.current_scope().symbol_id
+    op_conf.scope_symbol_id = scope_symbol_id
+    upstream_signature = MakeUpstreamSignature(ibn2blob_object)
+    return c_api_util.InferOpConf(op_conf, upstream_signature)
+
+
+def MakeUpstreamSignature(ibn2blob_object):
+    upstream_signature_cfg = op_node_signature_cfg.OpNodeSignature()
+    for (ibn, blob_object) in ibn2blob_object.items():
+        blob_object.op_arg_blob_attr.DumpToOpNodeSignature(ibn, upstream_signature_cfg)
+        blob_object.op_arg_parallel_attr.DumpToOpNodeSignature(
+            ibn, upstream_signature_cfg
+        )
+    return text_format.Parse(
+        str(upstream_signature_cfg), op_node_signature_pb.OpNodeSignature()
+    )
diff --git a/python/oneflow/compatible/single_client/eager/symbol.py b/python/oneflow/compatible/single_client/eager/symbol.py
new file mode 100644
index 0000000000000000000000000000000000000000..33a27950bc34b20ca738e3060b0b9ea92666aa12
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/symbol.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.core.job import placement_pb2 as placement_pb
+
+
+class Symbol(object):
+    def __init__(self, symbol_id, data):
+        self.symbol_id_ = symbol_id
+        self.data_ = data
+
+    @property
+    def symbol_id(self):
+        return self.symbol_id_
+
+    @property
+    def data(self):
+        return self.data_
diff --git a/python/oneflow/compatible/single_client/eager/symbol_storage.py b/python/oneflow/compatible/single_client/eager/symbol_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f5d0cddda3c664f3f7f55c9990e8437eb487fd3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/eager/symbol_storage.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def HasSymbol4Id(symbol_id):
+    global id2symbol
+    return symbol_id in id2symbol
+
+
+def GetSymbol4Id(symbol_id):
+    global id2symbol
+    assert symbol_id in id2symbol
+    return id2symbol[symbol_id]
+
+
+def SetSymbol4Id(symbol_id, symbol):
+    global id2symbol
+    assert symbol_id not in id2symbol
+    id2symbol[symbol_id] = symbol
+
+
+id2symbol = {}
+
+
+def HasSymbol4SerializedOpConf(serialized_op_conf):
+    global serialized_op_conf2symbol
+    return serialized_op_conf in serialized_op_conf2symbol
+
+
+def GetSymbol4SerializedOpConf(serialized_op_conf):
+    global serialized_op_conf2symbol
+    return serialized_op_conf2symbol[serialized_op_conf]
+
+
+def SetSymbol4SerializedOpConf(serialized_op_conf, symbol):
+    assert not HasSymbol4SerializedOpConf(serialized_op_conf)
+    global serialized_op_conf2symbol
+    serialized_op_conf2symbol[serialized_op_conf] = symbol
+
+
+serialized_op_conf2symbol = {}
diff --git a/python/oneflow/compatible/single_client/env.py b/python/oneflow/compatible/single_client/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..36dabbebeec64a88c625218fc63c07f0aafcc12e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/env.py
@@ -0,0 +1,42 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.env_util import (
+    api_all_device_placement as all_device_placement,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_ctrl_port as ctrl_port,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_data_port as data_port,
+)
+from oneflow.compatible.single_client.framework.env_util import api_env_init as init
+from oneflow.compatible.single_client.framework.env_util import (
+    api_get_current_resource as current_resource,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_grpc_use_no_signal as grpc_use_no_signal,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_init_bootstrap_confs as init_bootstrap_confs,
+)
+from oneflow.compatible.single_client.framework.env_util import api_log_dir as log_dir
+from oneflow.compatible.single_client.framework.env_util import (
+    api_logbuflevel as logbuflevel,
+)
+from oneflow.compatible.single_client.framework.env_util import (
+    api_logtostderr as logtostderr,
+)
+from oneflow.compatible.single_client.framework.env_util import api_machine as machine
diff --git a/python/oneflow/compatible/single_client/experimental/F/__init__.py b/python/oneflow/compatible/single_client/experimental/F/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/experimental/__init__.py b/python/oneflow/compatible/single_client/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fc8347785f2e3cc51f3df1e97cef7bee8f98683
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/__init__.py
@@ -0,0 +1,178 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.compatible.single_client import unittest
+from oneflow.compatible.single_client.experimental.indexed_slices_ops import (
+    indexed_slices_reduce_sum,
+)
+from oneflow.compatible.single_client.experimental.interface_op_read_and_write import (
+    FeedValueToInterfaceBlob as set_interface_blob_value,
+)
+from oneflow.compatible.single_client.experimental.interface_op_read_and_write import (
+    GetInterfaceBlobValue as get_interface_blob_value,
+)
+from oneflow.compatible.single_client.experimental.namescope import (
+    deprecated_name_scope as name_scope,
+)
+from oneflow.compatible.single_client.experimental.square_sum_op import square_sum
+from oneflow.compatible.single_client.experimental.ssp_variable_proxy_op import (
+    ssp_variable_proxy,
+)
+from oneflow.compatible.single_client.experimental.typing_check import (
+    api_enable_typing_check as enable_typing_check,
+)
+from oneflow.compatible.single_client.experimental.unique_op import unique_with_counts
+from oneflow.compatible.single_client.framework.c_api_util import (
+    GetJobSet as get_job_set,
+)
+from oneflow.compatible.single_client.nn.modules.abs import abs_op as abs
+from oneflow.compatible.single_client.nn.modules.acos import acos_op as acos
+from oneflow.compatible.single_client.nn.modules.acosh import acosh_op as acosh
+from oneflow.compatible.single_client.nn.modules.acosh import arccosh_op as arccosh
+from oneflow.compatible.single_client.nn.modules.activation import gelu_op as gelu
+from oneflow.compatible.single_client.nn.modules.activation import mish_op as mish
+from oneflow.compatible.single_client.nn.modules.activation import sigmoid_op as sigmoid
+from oneflow.compatible.single_client.nn.modules.activation import softmax_op as softmax
+from oneflow.compatible.single_client.nn.modules.activation import tanh_op as tanh
+from oneflow.compatible.single_client.nn.modules.arange import arange_op as arange
+from oneflow.compatible.single_client.nn.modules.argmax import argmax_op as argmax
+from oneflow.compatible.single_client.nn.modules.argsort import argsort_op as argsort
+from oneflow.compatible.single_client.nn.modules.argwhere import argwhere_op as argwhere
+from oneflow.compatible.single_client.nn.modules.atan2 import atan2_op as atan2
+from oneflow.compatible.single_client.nn.modules.atanh import arctanh_op as arctanh
+from oneflow.compatible.single_client.nn.modules.atanh import atanh_op as atanh
+from oneflow.compatible.single_client.nn.modules.bmm import bmm_op as bmm
+from oneflow.compatible.single_client.nn.modules.broadcast_like import (
+    broadcast_like_op as broadcast_like,
+)
+from oneflow.compatible.single_client.nn.modules.cast import cast_op as cast
+from oneflow.compatible.single_client.nn.modules.chunk import chunk_op as chunk
+from oneflow.compatible.single_client.nn.modules.concat import concat_op as cat
+from oneflow.compatible.single_client.nn.modules.constant import (
+    ones_like_op as ones_like,
+)
+from oneflow.compatible.single_client.nn.modules.constant import ones_op as ones
+from oneflow.compatible.single_client.nn.modules.constant import (
+    zeros_like_op as zeros_like,
+)
+from oneflow.compatible.single_client.nn.modules.constant import zeros_op as zeros
+from oneflow.compatible.single_client.nn.modules.dataset import (
+    tensor_buffer_to_list_of_tensors,
+)
+from oneflow.compatible.single_client.nn.modules.eq import eq_op as eq
+from oneflow.compatible.single_client.nn.modules.eq import eq_op as equal
+from oneflow.compatible.single_client.nn.modules.exp import exp_op as exp
+from oneflow.compatible.single_client.nn.modules.expand import expand_op as expand
+from oneflow.compatible.single_client.nn.modules.flatten import _flow_flatten as flatten
+from oneflow.compatible.single_client.nn.modules.floor import floor_op as floor
+from oneflow.compatible.single_client.nn.modules.gather import gather_op as gather
+from oneflow.compatible.single_client.nn.modules.greater import greater_op as gt
+from oneflow.compatible.single_client.nn.modules.greater_equal import (
+    greater_equal_op as ge,
+)
+from oneflow.compatible.single_client.nn.modules.less import less_op as lt
+from oneflow.compatible.single_client.nn.modules.less_equal import less_equal_op as le
+from oneflow.compatible.single_client.nn.modules.log1p import log1p_op as log1p
+from oneflow.compatible.single_client.nn.modules.masked_fill import (
+    masked_fill_op as masked_fill,
+)
+from oneflow.compatible.single_client.nn.modules.masked_select import (
+    masked_select_op as masked_select,
+)
+from oneflow.compatible.single_client.nn.modules.math_ops import _add as add
+from oneflow.compatible.single_client.nn.modules.math_ops import _div as div
+from oneflow.compatible.single_client.nn.modules.math_ops import _mul as mul
+from oneflow.compatible.single_client.nn.modules.math_ops import (
+    _reciprocal as reciprocal,
+)
+from oneflow.compatible.single_client.nn.modules.math_ops import _sub as sub
+from oneflow.compatible.single_client.nn.modules.math_ops import addmm_op as addmm
+from oneflow.compatible.single_client.nn.modules.math_ops import arcsin_op as arcsin
+from oneflow.compatible.single_client.nn.modules.math_ops import arcsinh_op as arcsinh
+from oneflow.compatible.single_client.nn.modules.math_ops import arctan_op as arctan
+from oneflow.compatible.single_client.nn.modules.math_ops import asin_op as asin
+from oneflow.compatible.single_client.nn.modules.math_ops import asinh_op as asinh
+from oneflow.compatible.single_client.nn.modules.math_ops import atan_op as atan
+from oneflow.compatible.single_client.nn.modules.math_ops import ceil_op as ceil
+from oneflow.compatible.single_client.nn.modules.math_ops import clamp_op as clamp
+from oneflow.compatible.single_client.nn.modules.math_ops import clip_op as clip
+from oneflow.compatible.single_client.nn.modules.math_ops import cos_op as cos
+from oneflow.compatible.single_client.nn.modules.math_ops import cosh_op as cosh
+from oneflow.compatible.single_client.nn.modules.math_ops import erf_op as erf
+from oneflow.compatible.single_client.nn.modules.math_ops import erfc_op as erfc
+from oneflow.compatible.single_client.nn.modules.math_ops import expm1_op as expm1
+from oneflow.compatible.single_client.nn.modules.math_ops import log_op as log
+from oneflow.compatible.single_client.nn.modules.math_ops import pow_op as pow
+from oneflow.compatible.single_client.nn.modules.math_ops import rsqrt_op as rsqrt
+from oneflow.compatible.single_client.nn.modules.math_ops import sin_op as sin
+from oneflow.compatible.single_client.nn.modules.math_ops import sqrt_op as sqrt
+from oneflow.compatible.single_client.nn.modules.math_ops import square_op as square
+from oneflow.compatible.single_client.nn.modules.math_ops import std_op as std
+from oneflow.compatible.single_client.nn.modules.math_ops import topk_op as topk
+from oneflow.compatible.single_client.nn.modules.math_ops import variance_op as var
+from oneflow.compatible.single_client.nn.modules.matmul import matmul_op as matmul
+from oneflow.compatible.single_client.nn.modules.meshgrid import meshgrid_op as meshgrid
+from oneflow.compatible.single_client.nn.modules.ne import ne_op as ne
+from oneflow.compatible.single_client.nn.modules.ne import ne_op as not_equal
+from oneflow.compatible.single_client.nn.modules.negative import negative_op as neg
+from oneflow.compatible.single_client.nn.modules.negative import negative_op as negative
+from oneflow.compatible.single_client.nn.modules.reduce_ops import _max as max
+from oneflow.compatible.single_client.nn.modules.reduce_ops import _mean as mean
+from oneflow.compatible.single_client.nn.modules.reduce_ops import _min as min
+from oneflow.compatible.single_client.nn.modules.reduce_ops import _sum as sum
+from oneflow.compatible.single_client.nn.modules.repeat import repeat_op as repeat
+from oneflow.compatible.single_client.nn.modules.reshape import reshape_op as reshape
+from oneflow.compatible.single_client.nn.modules.reshape import view_op as view
+from oneflow.compatible.single_client.nn.modules.round import round_op as round
+from oneflow.compatible.single_client.nn.modules.sign import sign_op as sign
+from oneflow.compatible.single_client.nn.modules.sinh import sinh_op as sinh
+from oneflow.compatible.single_client.nn.modules.slice import slice_op as slice
+from oneflow.compatible.single_client.nn.modules.slice import (
+    slice_update_op as slice_update,
+)
+from oneflow.compatible.single_client.nn.modules.softplus import softplus_op as softplus
+from oneflow.compatible.single_client.nn.modules.sort import sort_op as sort
+from oneflow.compatible.single_client.nn.modules.squeeze import squeeze_op as squeeze
+from oneflow.compatible.single_client.nn.modules.stack import stack
+from oneflow.compatible.single_client.nn.modules.tan import tan_op as tan
+from oneflow.compatible.single_client.nn.modules.tensor_buffer import gen_tensor_buffer
+from oneflow.compatible.single_client.nn.modules.tensor_buffer import (
+    tensor_buffer_to_tensor_op as tensor_buffer_to_tensor,
+)
+from oneflow.compatible.single_client.nn.modules.tensor_buffer import (
+    tensor_to_tensor_buffer,
+)
+from oneflow.compatible.single_client.nn.modules.tile import tile_op as tile
+from oneflow.compatible.single_client.nn.modules.transpose import (
+    transpose_op as transpose,
+)
+from oneflow.compatible.single_client.nn.modules.triu import triu_op as triu
+from oneflow.compatible.single_client.nn.modules.unsqueeze import (
+    unsqueeze_op as unsqueeze,
+)
+from oneflow.compatible.single_client.nn.modules.where import where_op as where
+from oneflow.compatible.single_client.ops.array_ops import (
+    logical_slice,
+    logical_slice_assign,
+)
+from oneflow.compatible.single_client.ops.assign_op import (
+    api_one_to_one_assign as eager_assign_121,
+)
+from oneflow.compatible.single_client.ops.util.custom_op_module import (
+    CustomOpModule as custom_op_module,
+)
+
+from . import scope
diff --git a/python/oneflow/compatible/single_client/experimental/indexed_slices_ops.py b/python/oneflow/compatible/single_client/experimental/indexed_slices_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1d22da8ead5258b53e6e965d7bc4d2c37a44a9e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/indexed_slices_ops.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def indexed_slices_reduce_sum(
+    indices: input_blob_util.ArgBlobDef,
+    values: input_blob_util.ArgBlobDef,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("IndexedSlicesReduceSum_")
+        )
+        .Op("indexed_slices_reduce_sum")
+        .Input("x_indices", [indices])
+        .Input("x_values", [values])
+        .Output("y_indices")
+        .Output("y_values")
+        .Output("num_unique")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()
diff --git a/python/oneflow/compatible/single_client/experimental/interface_op_read_and_write.py b/python/oneflow/compatible/single_client/experimental/interface_op_read_and_write.py
new file mode 100644
index 0000000000000000000000000000000000000000..f866ffe6b202e0c3e9953b6d2201ed01bceefa4e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/interface_op_read_and_write.py
@@ -0,0 +1,172 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.common import shape as shape_proto_cfg
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import dtype as dtype_util
+from oneflow.compatible.single_client.framework import (
+    input_blob_def as input_blob_def_util,
+)
+from oneflow.compatible.single_client.framework import push_util as push_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import runtime_mode as rt_mode
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import async_util as async_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def sync_default_session_if_normal():
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        flow.sync_default_session()
+    else:
+        pass
+
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def _GetInterfaceBlobObject(builder, op_name):
+    sess = session_ctx.GetDefaultSession()
+    if oneflow._oneflow_internal.EagerExecutionEnabled():
+        return sess.var_name2var_blob[op_name].blob_object
+    sess = session_ctx.GetDefaultSession()
+    op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    parallel_conf = sess.ParallelConf4LazyInterfaceOpName(op_name)
+    if not isinstance(
+        parallel_conf, oneflow._oneflow_internal.oneflow.core.job.placement.ParallelConf
+    ):
+        parallel_conf_cfg = placement_cfg.ParallelConf()
+        parallel_conf_cfg.set_device_tag(parallel_conf.device_tag)
+        for device_name in parallel_conf.device_name:
+            parallel_conf_cfg.add_device_name(device_name)
+        if parallel_conf.HasField("hierarchy"):
+            hierarchy = shape_proto_cfg.ShapeProto()
+            for dim in parallel_conf.hierarchy.dim:
+                hierarchy.add_dim(dim)
+            assert hierarchy.dim_size() > 0
+            parallel_conf_cfg.mutable_hierarchy().CopyFrom(hierarchy)
+        parallel_conf = parallel_conf_cfg
+    blob_object = builder.MakeLazyRefBlobObject(
+        op_name, cfg_op_attribute, parallel_conf
+    )
+    return blob_object
+
+
+def GetEagerInterfaceBlob(op_name):
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+
+    def CreateBlob():
+        job_name = sess.JobName4InterfaceOpName(op_name)
+
+        def Build(builder, Yield):
+            blob_object = _GetInterfaceBlobObject(builder, op_name)
+            lbi = lbi_util.LogicalBlobId()
+            lbi.set_op_name(op_name)
+            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
+            assert len(op_attribute.output_bns) == 1
+            lbi.set_blob_name(op_attribute.output_bns[0])
+            if blob_object.op_arg_parallel_attr.is_mirrored():
+                remote_blob = oneflow._oneflow_internal.EagerMirroredBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            else:
+                remote_blob = oneflow._oneflow_internal.EagerConsistentBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            Yield(remote_blob)
+
+        def AsyncGetInterfaceBlob(Yield):
+            oneflow._oneflow_internal.deprecated.LogicalRun(
+                lambda builder: Build(builder, Yield)
+            )
+
+        blob = async_util.Await(1, AsyncGetInterfaceBlob)[0]
+        return blob
+
+    return sess.FindOrCreateLazyBlob(op_name, CreateBlob)
+
+
+def GetInterfaceBlobValue(op_name):
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+    job_name = sess.JobName4InterfaceOpName(op_name)
+
+    def AsyncGetInterfaceBlobValue(Yield):
+        def build(builder):
+            blob_object = GetEagerInterfaceBlob(op_name).blob_object
+            lbi = lbi_util.LogicalBlobId()
+            lbi.set_op_name(op_name)
+            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
+            assert len(op_attribute.output_bns) == 1
+            lbi.set_blob_name(op_attribute.output_bns[0])
+            if not isinstance(lbi, lbi_util.LogicalBlobId):
+                cfg_lbi = lbi_util.LogicalBlobId()
+                cfg_lbi.set_op_name(lbi.op_name)
+                cfg_lbi.set_blob_name(lbi.blob_name)
+                lbi = cfg_lbi
+            if blob_object.op_arg_parallel_attr.is_mirrored():
+                remote_blob = oneflow._oneflow_internal.EagerMirroredBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            else:
+                remote_blob = oneflow._oneflow_internal.EagerConsistentBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            value = remote_blob.numpy()
+            Yield(value)
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+    return async_util.Await(1, AsyncGetInterfaceBlobValue)[0]
+
+
+def FeedValueToInterfaceBlobObject(blob_object, ndarray):
+    sync_default_session_if_normal()
+
+    def build(builder):
+        if blob_object.op_arg_parallel_attr.is_mirrored():
+            input_blob_def = input_blob_def_util.MirroredTensorDef(
+                ndarray.shape,
+                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
+            )
+        else:
+            input_blob_def = input_blob_def_util.FixedTensorDef(
+                ndarray.shape,
+                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
+            )
+        push_util.FeedValueToEagerBlob(blob_object, input_blob_def, ndarray)
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+
+def FeedValueToInterfaceBlob(op_name, ndarray):
+    sync_default_session_if_normal()
+
+    def AsyncFeedValueToInterfaceBlob(Yield):
+        def build(builder):
+            blob_object = GetEagerInterfaceBlob(op_name).blob_object
+            FeedValueToInterfaceBlobObject(blob_object, ndarray)
+            Yield()
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+    async_util.Await(1, AsyncFeedValueToInterfaceBlob)
diff --git a/python/oneflow/compatible/single_client/experimental/linalg.py b/python/oneflow/compatible/single_client/experimental/linalg.py
new file mode 100644
index 0000000000000000000000000000000000000000..250d3b3d1562be68252b5b0dd334f9b7db3b30ba
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/linalg.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.nn.modules.norm import norm_op as norm
diff --git a/python/oneflow/compatible/single_client/experimental/load_mnist.py b/python/oneflow/compatible/single_client/experimental/load_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1ddd8655cee8f193445f7406da090c1d72e60ea
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/load_mnist.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import hashlib
+import os
+
+import numpy as np
+import requests
+from tqdm import tqdm
+
+
+def get_sha256hash(file_path, Bytes=1024):
+    sha256hash = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        while True:
+            data = f.read(Bytes)
+            if data:
+                sha256hash.update(data)
+            else:
+                break
+    ret = sha256hash.hexdigest()
+    return ret
+
+
+def download_mnist_file(out_path, url):
+    resp = requests.get(url=url, stream=True)
+    size = int(resp.headers["Content-Length"]) / 1024
+    print("File size: %.4f kb, downloading..." % size)
+    with open(out_path, "wb") as f:
+        for data in tqdm(
+            iterable=resp.iter_content(1024), total=size, unit="k", desc=out_path
+        ):
+            f.write(data)
+        print("Done!")
+
+
+def get_mnist_file(sha256, url, out_dir):
+    path = os.path.join(out_dir, "mnist.npz")
+    if not os.path.isfile(path):
+        download_mnist_file(path, url)
+    print("File mnist.npz already exist, path:", path)
+    if not get_sha256hash(path) == sha256:
+        cheksum_fail = "sha256 verification failed, remove {0} and try again".format(
+            path
+        )
+        raise Exception(cheksum_fail)
+    return path
+
+
+def load_mnist(
+    train_batch_size=100,
+    test_batch_size=100,
+    data_format="NCHW",
+    url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist.npz",
+    hash_check="63d4344077849053dc3036b247fa012b2b381de53fd055a66b539dffd76cf08e",
+    out_dir=".",
+):
+    """Load mnist dataset, return images and labels,
+            if  dataset doesn't exist, then download it to directory that out_dir specified
+
+    Args:
+        train_batch_size (int, optional): batch size for train. Defaults to 100.
+        test_batch_size (int, optional): batch size for test or evaluate. Defaults to 100.
+        data_format (str, optional): data format. Defaults to "NCHW".
+        url (str, optional): url to get mnist.npz. Defaults to "https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist.npz".
+        hash_check (str, optional): file hash value. Defaults to "63d4344077849053dc3036b247fa012b2b381de53fd055a66b539dffd76cf08e".
+        out_dir (str, optional): dir to save downloaded file. Defaults to "./".
+
+    Returns:
+        [type]: (train_images, train_labels), (test_images, test_labels)
+    """
+    path = get_mnist_file(hash_check, url, out_dir)
+    with np.load(path, allow_pickle=True) as f:
+        (x_train, y_train) = (f["x_train"], f["y_train"])
+        (x_test, y_test) = (f["x_test"], f["y_test"])
+
+    def normalize(x, y, batch_size):
+        x = x.astype(np.float32) / 255.0
+        y = y.astype(np.int32)
+        if data_format == "NCHW":
+            images = x.reshape((-1, batch_size, 1, x.shape[1], x.shape[2]))
+        else:
+            images = x.reshape((-1, batch_size, x.shape[1], x.shape[2], 1))
+        labels = y.reshape((-1, batch_size))
+        return (images, labels)
+
+    (train_images, train_labels) = normalize(x_train, y_train, train_batch_size)
+    (test_images, test_labels) = normalize(x_test, y_test, test_batch_size)
+    return ((train_images, train_labels), (test_images, test_labels))
diff --git a/python/oneflow/compatible/single_client/experimental/namescope.py b/python/oneflow/compatible/single_client/experimental/namescope.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cd92c6e4a53bb822a478796c664ba896996b645
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/namescope.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import (
+    session_context as session_context,
+)
+
+
+@oneflow_deprecate()
+def deprecated_name_scope(*args, **kwargs):
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.name_scope/oneflow.compatible.single_client.experimental.name_scope/deprecated.variable_scope",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.compatible.single_client.scope.namespace"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return name_scope(*args, **kwargs)
+
+
+@contextmanager
+def name_scope(name: str) -> None:
+    """Create a namespace. All variables within the namespace will have a prefix `[SCOPE NAME]-`. This is for convenience only and has no other effect on the system.
+    Usage::
+
+        with oneflow.compatible.single_client.scope.namespace("scope1"):
+            ...
+            with oneflow.compatible.single_client.scope.namespace("scope2"):
+                ...
+
+    Args:
+        name: Name of this namespace
+
+    """
+    assert isinstance(name, str)
+    name_scope_stack_push(name)
+
+    def BuildScope(old_scope, builder):
+        return builder.BuildScopeWithNewScopeName(old_scope, name)
+
+    sess = session_context.GetDefaultSession()
+    try:
+        with scope_util.ScopeContext(scope_util.MakeScope(BuildScope)):
+            yield
+    finally:
+        name_scope_stack_pop()
+
+
+def name_scope_stack_push(name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    sess = session_context.GetDefaultSession()
+    if job_name not in sess.job_name2name_scope_stack:
+        sess.job_name2name_scope_stack[job_name] = []
+    sess.job_name2name_scope_stack[job_name].append(name)
+
+
+def name_scope_stack_pop():
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    sess = session_context.GetDefaultSession()
+    assert job_name in sess.job_name2name_scope_stack
+    assert len(sess.job_name2name_scope_stack[job_name]) > 0
+    return sess.job_name2name_scope_stack[job_name].pop()
+
+
+def GetJobNameScopePrefix(job_name):
+    sess = session_context.GetDefaultSession()
+    if job_name not in sess.job_name2name_scope_stack:
+        return ""
+    if len(sess.job_name2name_scope_stack[job_name]) == 0:
+        return ""
+    return "-".join(sess.job_name2name_scope_stack[job_name]) + "-"
+
+
+def PrependOpNamePrefixIfNeed(op_conf):
+    if op_conf.HasField("variable_conf"):
+        return
+    if op_conf.HasField("decode_ofrecord_conf"):
+        return
+    if op_conf.HasField("user_conf"):
+        return
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_conf.name = GetJobNameScopePrefix(job_name) + op_conf.name
diff --git a/python/oneflow/compatible/single_client/experimental/nn/__init__.py b/python/oneflow/compatible/single_client/experimental/nn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cced108d0ef9cd300cc1b279fccf2ea8013b3e0e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/nn/__init__.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.nn.modules.activation import (
+    ELU,
+    GELU,
+    Hardsigmoid,
+    Hardswish,
+    Hardtanh,
+    LeakyReLU,
+    LogSigmoid,
+    LogSoftmax,
+    Mish,
+    PReLU,
+    ReLU,
+    ReLU6,
+    Sigmoid,
+    Softmax,
+    Softplus,
+    Tanh,
+)
+from oneflow.compatible.single_client.nn.modules.adaptive_pool import AdaptiveAvgPool2d
+from oneflow.compatible.single_client.nn.modules.batchnorm import (
+    BatchNorm1d,
+    BatchNorm2d,
+)
+from oneflow.compatible.single_client.nn.modules.constantpad2d import ConstantPad2d
+from oneflow.compatible.single_client.nn.modules.container import (
+    ModuleDict,
+    ModuleList,
+    ParameterDict,
+    ParameterList,
+    Sequential,
+)
+from oneflow.compatible.single_client.nn.modules.conv import Conv1d, Conv2d
+from oneflow.compatible.single_client.nn.modules.dataset import (
+    COCOReader,
+    CoinFlip,
+    CropMirrorNormalize,
+    OFRecordImageDecoder,
+    OFRecordImageDecoderRandomCrop,
+    OfrecordRawDecoder,
+    OfrecordReader,
+)
+from oneflow.compatible.single_client.nn.modules.deconv import ConvTranspose2d
+from oneflow.compatible.single_client.nn.modules.dropout import Dropout
+from oneflow.compatible.single_client.nn.modules.flatten import Flatten
+from oneflow.compatible.single_client.nn.modules.instancenorm import (
+    InstanceNorm1d,
+    InstanceNorm2d,
+    InstanceNorm3d,
+)
+from oneflow.compatible.single_client.nn.modules.linear import Identity, Linear
+from oneflow.compatible.single_client.nn.modules.loss import (
+    BCELoss,
+    BCEWithLogitsLoss,
+    CrossEntropyLoss,
+    CTCLoss,
+    KLDivLoss,
+    L1Loss,
+    MarginRankingLoss,
+    MSELoss,
+    NLLLoss,
+)
+from oneflow.compatible.single_client.nn.modules.normalization import (
+    GroupNorm,
+    LayerNorm,
+)
+from oneflow.compatible.single_client.nn.modules.padding import (
+    ReflectionPad2d,
+    ReplicationPad2d,
+)
+from oneflow.compatible.single_client.nn.modules.pixelshuffle import PixelShuffle
+from oneflow.compatible.single_client.nn.modules.pooling import (
+    AvgPool1d,
+    AvgPool2d,
+    AvgPool3d,
+    MaxPool1d,
+    MaxPool2d,
+    MaxPool3d,
+)
+from oneflow.compatible.single_client.nn.modules.upsampling import (
+    Upsample,
+    UpsamplingBilinear2d,
+    UpsamplingNearest2d,
+)
+from oneflow.compatible.single_client.nn.modules.zeropad2d import ZeroPad2d
diff --git a/python/oneflow/compatible/single_client/experimental/nn/image.py b/python/oneflow/compatible/single_client/experimental/nn/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..1491d5b9f3bf6b1091ce8ece208dd7487e967384
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/nn/image.py
@@ -0,0 +1,23 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.nn.modules.dataset import (
+    ImageBatchAlign as batch_align,
+)
+from oneflow.compatible.single_client.nn.modules.dataset import ImageDecode as decode
+from oneflow.compatible.single_client.nn.modules.dataset import (
+    ImageNormalize as normalize,
+)
+from oneflow.compatible.single_client.nn.modules.dataset import ImageResize as Resize
diff --git a/python/oneflow/compatible/single_client/experimental/optim/__init__.py b/python/oneflow/compatible/single_client/experimental/optim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..66b9888ab1f0effd67f1ae68004cf52e7eeef6a6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/optim/__init__.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.nn.optimizer.adam import Adam
+from oneflow.compatible.single_client.nn.optimizer.adamw import AdamW
+from oneflow.compatible.single_client.nn.optimizer.optimizer import Optimizer
+from oneflow.compatible.single_client.nn.optimizer.rmsprop import RMSprop
+from oneflow.compatible.single_client.nn.optimizer.sgd import SGD
diff --git a/python/oneflow/compatible/single_client/experimental/optim/lr_scheduler.py b/python/oneflow/compatible/single_client/experimental/optim/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f43850baaa23724957c10cf039ace385a0cb41e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/optim/lr_scheduler.py
@@ -0,0 +1,23 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.nn.optimizer.cosine_annealing_lr import (
+    CosineAnnealingLR,
+)
+from oneflow.compatible.single_client.nn.optimizer.lambda_lr import LambdaLR
+from oneflow.compatible.single_client.nn.optimizer.lr_scheduler import (
+    LrScheduler as _LRScheduler,
+)
+from oneflow.compatible.single_client.nn.optimizer.step_lr import StepLR
diff --git a/python/oneflow/compatible/single_client/experimental/scope.py b/python/oneflow/compatible/single_client/experimental/scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1746c7585e35bfa7e5cde6afdc73804dee5ce09
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/scope.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.scope_util import (
+    api_scope_config as config,
+)
diff --git a/python/oneflow/compatible/single_client/experimental/square_sum_op.py b/python/oneflow/compatible/single_client/experimental/square_sum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..db2de8fd6785008e8aca7f25582c7d7e6bfc957a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/square_sum_op.py
@@ -0,0 +1,44 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import operator
+from functools import reduce
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def square_sum(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SquareSum_")
+        )
+        .Op("square_sum")
+        .Input("x", [x])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/compatible/single_client/experimental/ssp_variable_proxy_op.py b/python/oneflow/compatible/single_client/experimental/ssp_variable_proxy_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c600e716c6c1da679c2d506e55ead54f87ec4879
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/ssp_variable_proxy_op.py
@@ -0,0 +1,41 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Tuple
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def ssp_variable_proxy(
+    var: oneflow._oneflow_internal.BlobDesc, buffer_size: int = 1, name=None
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """ return ref_blob, value_blob """
+    if name is None:
+        name = id_util.UniqueStr("SspVariableProxy_")
+    blob_dict = (
+        flow.user_op_builder(name)
+        .Op("ssp_variable_proxy")
+        .Input("var", [var])
+        .Output("ref")
+        .Output("value")
+        .Attr("buffer_size", buffer_size)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobDict()
+    )
+    return (blob_dict["ref"][0], blob_dict["value"][0])
diff --git a/python/oneflow/compatible/single_client/experimental/tmp.py b/python/oneflow/compatible/single_client/experimental/tmp.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0d7f87d5ca0ebdc202f3cf67508d31eb093dded
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/tmp.py
@@ -0,0 +1,24 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.nn.modules.dataset import (
+    get_ofrecord_handle as OfrecordReader,
+)
+from oneflow.compatible.single_client.nn.modules.dataset import (
+    raw_decoder as RawDecoder,
+)
+from oneflow.compatible.single_client.nn.modules.slice import (
+    logical_slice_assign_op as logical_slice_assign,
+)
diff --git a/python/oneflow/compatible/single_client/experimental/typing_check.py b/python/oneflow/compatible/single_client/experimental/typing_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf4dc884455029632ad774c236fee97e996b017
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/typing_check.py
@@ -0,0 +1,31 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+def api_enable_typing_check(val: bool = True) -> None:
+    """ enable typing check for global_function """
+    return enable_if.unique([enable_typing_check])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.any_global_function_defined)
+def enable_typing_check(val):
+    global typing_check_enabled
+    typing_check_enabled = val
+
+
+typing_check_enabled = False
diff --git a/python/oneflow/compatible/single_client/experimental/unique_op.py b/python/oneflow/compatible/single_client/experimental/unique_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1868c89d775451d9c29ceaca983bf69f3f9ca9ef
--- /dev/null
+++ b/python/oneflow/compatible/single_client/experimental/unique_op.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def unique_with_counts(
+    x: input_blob_util.ArgBlobDef,
+    out_idx: flow.dtype = flow.int32,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("UniqueWithCounts_")
+        )
+        .Op("unique_with_counts")
+        .Input("x", [x])
+        .Attr("out_idx", out_idx)
+        .Output("y")
+        .Output("idx")
+        .Output("count")
+        .Output("num_unique")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()
diff --git a/python/oneflow/compatible/single_client/framework/__init__.py b/python/oneflow/compatible/single_client/framework/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/framework/attr_util.py b/python/oneflow/compatible/single_client/framework/attr_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..328dd2879f0f412ac92a779cc26d00ac8dd5efaa
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/attr_util.py
@@ -0,0 +1,128 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.common import data_type as data_type_cfg
+from oneflow._oneflow_internal.oneflow.core.common import shape as shape_cfg
+from oneflow._oneflow_internal.oneflow.core.framework import (
+    user_op_attr as user_op_attr_cfg,
+)
+from oneflow.compatible import single_client as flow
+
+
+def SetAttrValue(attr_value, py_value, default_attr_value):
+    if default_attr_value.HasField("at_bool"):
+        if py_value is None:
+            py_value = True
+        assert type(py_value) is bool
+        attr_value.set_at_bool(py_value)
+    elif default_attr_value.HasField("at_int64"):
+        assert type(py_value) is int
+        attr_value.set_at_int64(py_value)
+    elif default_attr_value.HasField("at_double"):
+        assert type(py_value) is float
+        attr_value.set_at_double(py_value)
+    elif default_attr_value.HasField("at_string"):
+        assert type(py_value) is str
+        attr_value.set_at_string(py_value)
+    else:
+        raise ValueError(
+            "config with type %s is invalid. supported types: [bool, int, float, str]"
+            % type(py_value)
+        )
+
+
+def convert_to_user_attr_value(op_type_name, attr_name, attr_value):
+    attribute = user_op_attr_cfg.AttrValue()
+    assert isinstance(attr_name, str)
+    attr_type = oneflow._oneflow_internal.GetUserOpAttrType(op_type_name, attr_name)
+    if attr_type == user_op_attr_cfg.kAtInt32:
+        assert isinstance(attr_value, int)
+        attribute.set_at_int32(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtInt64:
+        assert isinstance(attr_value, int)
+        attribute.set_at_int64(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtBool:
+        assert isinstance(attr_value, bool)
+        attribute.set_at_bool(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtFloat:
+        assert isinstance(attr_value, (float, int))
+        attribute.set_at_float(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtDouble:
+        assert isinstance(attr_value, (float, int))
+        attribute.set_at_double(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtString:
+        assert isinstance(attr_value, str)
+        attribute.set_at_string(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtShape:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_shape = attribute.mutable_at_shape()
+        for x in attr_value:
+            assert isinstance(x, int)
+            attribute_mutable_at_shape.add_dim(x)
+    elif attr_type == user_op_attr_cfg.kAtDataType:
+        assert attr_value in flow.dtypes()
+        attr_value = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            attr_value
+        )
+        assert isinstance(attr_value, int)
+        attribute.set_at_data_type(data_type_cfg.DataType(attr_value))
+    elif attr_type == user_op_attr_cfg.kAtListInt32:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_int32 = attribute.mutable_at_list_int32()
+        for x in attr_value:
+            assert isinstance(x, int)
+            attribute_mutable_at_list_int32.add_val(x)
+    elif attr_type == user_op_attr_cfg.kAtListInt64:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_int64 = attribute.mutable_at_list_int64()
+        for x in attr_value:
+            assert isinstance(x, int)
+            attribute_mutable_at_list_int64.add_val(x)
+    elif attr_type == user_op_attr_cfg.kAtListFloat:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_float = attribute.mutable_at_list_float()
+        for x in attr_value:
+            assert isinstance(x, (float, int))
+            attribute_mutable_at_list_float.add_val(x)
+    elif attr_type == user_op_attr_cfg.kAtListDataType:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_data_type = attribute.mutable_at_list_data_type()
+        for x in attr_value:
+            assert x in flow.dtypes()
+            x = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(x)
+            assert isinstance(x, int)
+            attribute_mutable_at_list_data_type.add_val(data_type_cfg.DataType(x))
+    elif attr_type == user_op_attr_cfg.kAtListShape:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_shape = (
+            attribute.mutable_at_list_shape().mutable_val()
+        )
+        for x in attr_value:
+            assert isinstance(x, (tuple, list))
+            shape = shape_cfg.ShapeProto()
+            for dim in x:
+                assert isinstance(dim, int)
+                shape.add_dim(dim)
+            attribute_mutable_at_list_shape.Add().CopyFrom(shape)
+    elif attr_type == user_op_attr_cfg.kAtListString:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_string = attribute.mutable_at_list_string()
+        for x in attr_value:
+            assert isinstance(x, str)
+            attribute_mutable_at_list_string.add_val(x)
+    else:
+        raise ValueError("Invalid op attribute type {}".format(attr_type))
+    return attribute
diff --git a/python/oneflow/compatible/single_client/framework/balanced_splitter.py b/python/oneflow/compatible/single_client/framework/balanced_splitter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9a72178fb31eca0e8569681163736bc9d4f8584
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/balanced_splitter.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def BalancedPartNums(total, part_size):
+    base = int(total / part_size)
+    remainder = total % part_size
+    return [base + int(i < remainder) for i in range(part_size)]
+
+
+def BalancedRanges(total, part_size):
+    balanced_part_nums = BalancedPartNums(total, part_size)
+    ranges = []
+    start = 0
+    for part_num in balanced_part_nums:
+        end = start + part_num
+        ranges.append((start, end))
+        start == end
+    return ranges
diff --git a/python/oneflow/compatible/single_client/framework/blob.py b/python/oneflow/compatible/single_client/framework/blob.py
new file mode 100644
index 0000000000000000000000000000000000000000..07591c2bce285bcdb943afc072f4636508fee4ad
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/blob.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+
+class Blob(object):
+    def __init__(self, ndarray=None):
+        self.ndarray_ = ndarray
+
+    def ndarray(self):
+        return self.ndarray_
+
+    def set_ndarray(self, ndarray):
+        self.ndarray_ = ndarray
+
+    def __getattr__(self, attr):
+        return getattr(self.ndarray_, attr)
+
+
+no_override_field = set(
+    [
+        "__class__",
+        "__doc__",
+        "__new__",
+        "__init__",
+        "__del__",
+        "__call__",
+        "__getattr__",
+        "__getattribute__",
+        "__setattr__",
+        "__delattr__",
+        "__dir__",
+        "__get__",
+        "__set__",
+        "__delete__",
+    ]
+)
+
+
+def MakeBlobMethod(field_name):
+    def ConvertOtherArgs(args):
+        return [x.ndarray_ if isinstance(x, Blob) else x for x in args]
+
+    return lambda self, *args: getattr(self.ndarray_, field_name)(
+        *ConvertOtherArgs(args)
+    )
+
+
+for field_name in dir(np.ndarray):
+    if field_name.startswith("__") == False:
+        continue
+    if field_name in no_override_field:
+        continue
+    setattr(Blob, field_name, MakeBlobMethod(field_name))
diff --git a/python/oneflow/compatible/single_client/framework/blob_trait.py b/python/oneflow/compatible/single_client/framework/blob_trait.py
new file mode 100644
index 0000000000000000000000000000000000000000..076ca496f68b58fec4d21293eba5b7297a3bbf54
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/blob_trait.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+
+def __add__(self, rhs):
+    return flow.math.add(self, rhs)
+
+
+def __radd__(self, lhs):
+    return flow.math.add(lhs, self)
+
+
+def __sub__(self, rhs):
+    return flow.math.subtract(self, rhs)
+
+
+def __rsub__(self, lhs):
+    return flow.math.subtract(lhs, self)
+
+
+def __mul__(self, rhs):
+    return flow.math.multiply(self, rhs)
+
+
+def __rmul__(self, lhs):
+    return flow.math.multiply(lhs, self)
+
+
+def __truediv__(self, rhs):
+    return flow.math.divide(self, rhs)
+
+
+def __rtruediv__(self, lhs):
+    return flow.math.divide(lhs, self)
+
+
+def __div__(self, rhs):
+    return flow.math.divide(self, rhs)
+
+
+def __mod__(self, rhs):
+    return flow.math.mod(self, rhs)
+
+
+def __eq__(self, rhs):
+    return flow.math.equal(self, rhs)
+
+
+def __ne__(self, rhs):
+    return flow.math.not_equal(self, rhs)
+
+
+def __lt__(self, rhs):
+    return flow.math.less(self, rhs)
+
+
+def __le__(self, rhs):
+    return flow.math.less_equal(self, rhs)
+
+
+def __gt__(self, rhs):
+    return flow.math.greater(self, rhs)
+
+
+def __ge__(self, rhs):
+    return flow.math.greater_equal(self, rhs)
+
+
+def RegisterBlobOperatorTraitMethod(blob_class):
+    blob_class.__add__ = __add__
+    blob_class.__radd__ = __radd__
+    blob_class.__sub__ = __sub__
+    blob_class.__rsub__ = __rsub__
+    blob_class.__mul__ = __mul__
+    blob_class.__rmul__ = __rmul__
+    blob_class.__truediv__ = __truediv__
+    blob_class.__rtruediv__ = __rtruediv__
+    blob_class.__div__ = __div__
+    blob_class.__mod__ = __mod__
+    blob_class.__eq__ = __eq__
+    blob_class.__ne__ = __ne__
+    blob_class.__lt__ = __lt__
+    blob_class.__le__ = __le__
+    blob_class.__gt__ = __gt__
+    blob_class.__ge__ = __ge__
diff --git a/python/oneflow/compatible/single_client/framework/c_api_util.py b/python/oneflow/compatible/single_client/framework/c_api_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..54bd6c464be3ce7bbcb34e7d00ad2b6ca61aedcd
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/c_api_util.py
@@ -0,0 +1,252 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow.core.common import data_type_pb2 as dtype_util
+from oneflow.core.common import error_pb2 as error_util
+from oneflow.core.framework.config_def_pb2 import ConfigDef
+from oneflow.core.job import env_pb2 as env_pb2
+from oneflow.core.job import job_pb2 as job_pb
+from oneflow.core.job import job_set_pb2 as job_set_pb
+from oneflow.core.job import placement_pb2 as placement_pb
+from oneflow.core.job import resource_pb2 as resource_util
+from oneflow.core.job.inter_user_job_info_pb2 import InterUserJobInfo
+from oneflow.core.operator import op_attribute_pb2 as op_attribute_pb
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.record import record_pb2 as record_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def CurrentResource():
+    resource = oneflow._oneflow_internal.CurrentResource()
+    return text_format.Parse(resource, resource_util.Resource())
+
+
+def EnvResource():
+    resource = oneflow._oneflow_internal.EnvResource()
+    return text_format.Parse(resource, resource_util.Resource())
+
+
+def InitEnv(env_proto, is_multi_client):
+    assert type(env_proto) is env_pb2.EnvProto
+    env_proto_str = text_format.MessageToString(env_proto)
+    oneflow._oneflow_internal.InitEnv(env_proto_str, is_multi_client)
+
+
+def InitLazyGlobalSession(config_proto):
+    assert type(config_proto) is job_set_pb.ConfigProto
+    config_proto_str = text_format.MessageToString(config_proto)
+    oneflow._oneflow_internal.InitLazyGlobalSession(config_proto_str)
+
+
+def GetInterUserJobInfo():
+    inter_user_job_info = oneflow._oneflow_internal.GetSerializedInterUserJobInfo()
+    ret = InterUserJobInfo()
+    ret.ParseFromString(inter_user_job_info)
+    return ret
+
+
+def JobBuildAndInferCtx_Open(job_name):
+    job_name = str(job_name)
+    oneflow._oneflow_internal.JobBuildAndInferCtx_Open(job_name)
+
+
+def CurJobBuildAndInferCtx_SetJobConf(job_config_proto):
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_SetJobConf(job_config_proto)
+
+
+def CurJobBuildAndInferCtx_SetTrainConf(train_config_cfg):
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_SetTrainConf(train_config_cfg)
+
+
+def InferOpConf(op_conf_proto, upstream_signature):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    serialized_upstream_sig = str(text_format.MessageToString(upstream_signature))
+    op_attribute_str = oneflow._oneflow_internal.InferOpConf(
+        serialized_op_conf, serialized_upstream_sig
+    )
+    return text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+
+
+def IsInterfaceOpConf(op_conf):
+    op_type_field = op_conf.WhichOneof("op_type")
+    field_number = op_conf_util.OperatorConf.DESCRIPTOR.fields_by_name[
+        op_type_field
+    ].number
+    return oneflow._oneflow_internal.IsInterfaceOpTypeCase(field_number)
+
+
+def GetOpParallelSymbolId(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    return oneflow._oneflow_internal.GetOpParallelSymbolId(serialized_op_conf)
+
+
+def CheckAndCompleteUserOpConf(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    new_op_conf = oneflow._oneflow_internal.CheckAndCompleteUserOpConf(
+        serialized_op_conf
+    )
+    return text_format.Parse(new_op_conf, op_conf_util.OperatorConf())
+
+
+def CurJobBuildAndInferCtx_AddAndInferConsistentOp(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    add_and_infer = (
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddAndInferConsistentOp
+    )
+    op_attribute_str = add_and_infer(serialized_op_conf)
+    return text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+
+
+def CurJobBuildAndInferCtx_AddAndInferMirroredOp(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    add_and_infer = (
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddAndInferMirroredOp
+    )
+    op_attribute_str = add_and_infer(serialized_op_conf)
+    return text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+
+
+def CurJobBuildAndInferCtx_AddLossLogicalBlobName(lbn):
+    lbn = str(lbn)
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddLossLogicalBlobName(lbn)
+
+
+def CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(lbi_and_uuid):
+    serialized = str(text_format.MessageToString(lbi_and_uuid))
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(
+        serialized
+    )
+
+
+def JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    return oneflow._oneflow_internal.JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn)
+
+
+def JobBuildAndInferCtx_MirroredBlobGetNumSubLbi(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    return oneflow._oneflow_internal.JobBuildAndInferCtx_MirroredBlobGetNumSubLbi(
+        job_name, lbn
+    )
+
+
+def JobBuildAndInferCtx_MirroredBlobGetSubLbi(job_name, lbn, index):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    ret = oneflow._oneflow_internal.JobBuildAndInferCtx_MirroredBlobGetSerializedSubLbi(
+        job_name, lbn, index
+    )
+    return text_format.Parse(ret, logical_blob_id_util.LogicalBlobId())
+
+
+def JobBuildAndInferCtx_GetStaticShape(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    axis_str = oneflow._oneflow_internal.JobBuildAndInferCtx_GetSerializedIdListAsStaticShape(
+        job_name, lbn
+    )
+    int_list = text_format.Parse(axis_str, record_util.Int64List())
+    return tuple(map(int, int_list.value))
+
+
+def JobBuildAndInferCtx_GetDataType(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    dtype = oneflow._oneflow_internal.JobBuildAndInferCtx_GetDataType(job_name, lbn)
+    return int(dtype)
+
+
+def JobBuildAndInferCtx_IsDynamic(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    ret = oneflow._oneflow_internal.JobBuildAndInferCtx_IsDynamic(job_name, lbn)
+    return ret
+
+
+def JobBuildAndInferCtx_DisableBoxing(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    ret = oneflow._oneflow_internal.JobBuildAndInferCtx_DisableBoxing(job_name, lbn)
+    return ret
+
+
+def JobBuildAndInferCtx_GetSplitAxisFromProducerView(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    split_axis_str = oneflow._oneflow_internal.JobBuildAndInferCtx_GetSplitAxisFromProducerView(
+        job_name, lbn
+    )
+    split_axis = text_format.Parse(split_axis_str, dtype_util.OptInt64())
+    if split_axis.HasField("value"):
+        return split_axis.value
+    return None
+
+
+def JobBuildAndInferCtx_GetParallelConfFromProducerView(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    GetParallelConf = (
+        oneflow._oneflow_internal.JobBuildAndInferCtx_GetSerializedParallelConfFromProducerView
+    )
+    parallel_conf = GetParallelConf(job_name, lbn)
+    parallel_conf = text_format.Parse(parallel_conf, placement_pb.ParallelConf())
+    parallel_conf_cfg = placement_cfg.ParallelConf()
+    parallel_conf_cfg.set_device_tag(parallel_conf.device_tag)
+    for device_name in parallel_conf.device_name:
+        parallel_conf_cfg.add_device_name(device_name)
+    return parallel_conf_cfg
+
+
+def GetMachine2DeviceIdListOFRecordFromParallelConf(parallel_conf):
+    serialized_parallel_conf = str(parallel_conf)
+    ofrecord = oneflow._oneflow_internal.GetMachine2DeviceIdListOFRecordFromParallelConf(
+        serialized_parallel_conf
+    )
+    return text_format.Parse(ofrecord, record_util.OFRecord())
+
+
+def GetFunctionConfigDef():
+    func_config_def = oneflow._oneflow_internal.GetFunctionConfigDef()
+    return text_format.Parse(func_config_def, ConfigDef())
+
+
+def GetScopeConfigDef():
+    scope_config_def = oneflow._oneflow_internal.GetScopeConfigDef()
+    return text_format.Parse(scope_config_def, ConfigDef())
+
+
+def GetInterfaceOpAttributes():
+    op_attributes = oneflow._oneflow_internal.GetSerializedInterfaceOpAttributes()
+    return text_format.Parse(op_attributes, op_attribute_pb.OpAttributeList())
+
+
+def GetJobSet():
+    job_set = oneflow._oneflow_internal.GetSerializedJobSet()
+    ret = job_set_pb.JobSet()
+    ret.ParseFromString(job_set)
+    return ret
+
+
+def GetCurrentJob():
+    serialized_job = oneflow._oneflow_internal.GetSerializedCurrentJob()
+    ret = job_pb.Job()
+    ret.ParseFromString(serialized_job)
+    return ret
diff --git a/python/oneflow/compatible/single_client/framework/check_point.py b/python/oneflow/compatible/single_client/framework/check_point.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed1d963822b1c690bfdc95b7c90b7a8d16c3843
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/check_point.py
@@ -0,0 +1,230 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import datetime
+import os
+import shutil
+from typing import List, Union
+
+import numpy as np
+
+from oneflow.compatible.single_client.eager import op_executor as op_executor
+from oneflow.compatible.single_client.framework import check_point_v2 as check_point_v2
+from oneflow.compatible.single_client.framework import config_util as config_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import job_instance as job_instance
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+class CheckPoint(object):
+    """Create a `CheckPoint` object to manage checkpoint manually.
+
+    """
+
+    def __init__(self) -> None:
+        if not config_util.api_legacy_model_io_enabled():
+            print(
+                "\x1b[1mWARNING: 'flow.train.CheckPoint' is deprecated. Please use the new API:\x1b[0m\nflow.train.CheckPoint().save(path) => \x1b[1m\x1b[92mflow.checkpoint.save(path)\x1b[0m\nflow.train.CheckPoint().load(path) => \x1b[1m\x1b[92mflow.load_variables(flow.checkpoint.get(path))\x1b[0m\nflow.train.CheckPoint().init() is not needed any more.\n"
+            )
+
+    @session_ctx.try_init_default_session
+    def save(self, path: str) -> None:
+        """save a checkpoint to `path`.
+
+        Args:
+            path: A `string` of path to save checkpoint. 
+        """
+        if not config_util.api_legacy_model_io_enabled():
+            check_point_v2.SaveVarDict(path)
+            return
+        assert type(path) is str
+        enable_if.unique([lazy_checkpoint_save, eager_checkpoint_save])(path)
+
+    @session_ctx.try_init_default_session
+    def init(self) -> None:
+        """Initialize models by default initializer of op or Job.
+        """
+        if not config_util.api_legacy_model_io_enabled():
+            return
+        enable_if.unique([lazy_checkpoint_init, eager_checkpoint_init])()
+
+    @session_ctx.try_init_default_session
+    def load(self, path: str) -> None:
+        """load a checkpoint from `path` and initialize models.
+
+        Args:
+            path: A `string` of path to load checkpoint.
+        """
+        if not config_util.api_legacy_model_io_enabled():
+            check_point_v2.LoadVariables(check_point_v2.GetCheckpoint(path))
+            return
+        assert type(path) is str
+        enable_if.unique([lazy_checkpoint_load, eager_checkpoint_load])(path)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.eager_execution_enabled)
+def lazy_checkpoint_save(path):
+    session_ctx.GetDefaultSession().LaunchJob(_MakeModelSaveJobFunc(path))
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.eager_execution_enabled)
+def lazy_checkpoint_init():
+    session_ctx.GetDefaultSession().LaunchJob(_MakeModelInitJobFunc())
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.eager_execution_enabled)
+def lazy_checkpoint_load(path):
+    session_ctx.GetDefaultSession().LaunchJob(_MakeModelLoadJobFunc(path))
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_checkpoint_save(path):
+    op_executor.EagerSaveVariableBlob(path)
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_checkpoint_init():
+    pass
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_checkpoint_load(path):
+    session_ctx.GetDefaultSession().snapshot_mgr.load(path)
+
+
+def _MakeModelInitJobFunc():
+    def push_cb(blob):
+        pass
+
+    def finish_cb():
+        pass
+
+    sess = session_ctx.GetDefaultSession()
+    return job_instance.MakeJobInstance(
+        str(sess.inter_user_job_info.global_model_init_job_name),
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def _MakeModelLoadJobFunc(path):
+    def push_cb(blob):
+        blob.CopyFromNdarray(np.frombuffer(path.encode("ascii"), dtype=np.int8))
+
+    def finish_cb():
+        pass
+
+    sess = session_ctx.GetDefaultSession()
+    return job_instance.MakeJobInstance(
+        str(sess.inter_user_job_info.global_model_load_job_name),
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def _MakeModelSaveJobFunc(path):
+    def push_cb(blob):
+        blob.CopyFromNdarray(np.frombuffer(path.encode("ascii"), dtype=np.int8))
+
+    def finish_cb():
+        pass
+
+    sess = session_ctx.GetDefaultSession()
+    return job_instance.MakeJobInstance(
+        str(sess.inter_user_job_info.global_model_save_job_name),
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+class SimpleCheckPointManager(object):
+    """`SimpleCheckPointManager` is a simple automatic checkpoint manager.
+
+    Args:
+        root_path: root path of snapshot
+        prefix: prefix of snapshot
+    """
+
+    def __init__(self, root_path: str, prefix: str = "snapshot_") -> None:
+        if not os.path.exists(root_path):
+            os.makedirs(root_path)
+        else:
+            assert os.path.isdir(root_path)
+        self._root_path = root_path
+        self._prefix = prefix
+
+    def list_checkpoints(self) -> List[str]:
+        def is_snapshot(name):
+            if not name.startswith(self._prefix):
+                return False
+            snapshot_done = os.path.join(self._GetSnapshotPath(name), "snapshot_done")
+            return os.path.exists(snapshot_done) and os.path.isfile(snapshot_done)
+
+        return sorted([f for f in os.listdir(self._root_path) if is_snapshot(f)])
+
+    def latest_checkpoint(self) -> Union[str, None]:
+        names = self.list_checkpoints()
+        if not names:
+            return None
+        else:
+            return names[-1]
+
+    def initialize_or_restore(self) -> None:
+        name = self.latest_checkpoint()
+        if name:
+            check_point_v2.LoadVariables(
+                check_point_v2.GetCheckpoint(self._GetSnapshotPath(name))
+            )
+        else:
+            self.save()
+
+    def save(self) -> None:
+        check_point_v2.SaveVarDict(self._GetSnapshotPath(self._NextSnapshotName()))
+
+    def _NextSnapshotName(self) -> str:
+        return self._prefix + datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+
+    def _GetSnapshotPath(self, name: str) -> str:
+        return os.path.join(self._root_path, name)
+
+
+class SnapshotManager(object):
+    def __init__(self):
+        self.name2path_ = dict()
+
+    def load(self, root_dir, refresh=True):
+        assert os.path.isdir(root_dir)
+        if refresh:
+            self.name2path_ = dict()
+        for file in os.listdir(root_dir):
+            file_path = os.path.join(root_dir, file)
+            if not os.path.isdir(file_path):
+                continue
+            has_out_subfile = False
+            for f in os.listdir(file_path):
+                fpath = os.path.join(file_path, f)
+                if f == "out" and os.path.isfile(fpath):
+                    has_out_subfile = True
+            if not has_out_subfile:
+                continue
+            assert file not in self.name2path_
+            self.name2path_[file] = os.path.join(file_path, "out")
+
+    def get_snapshot_path(self, name):
+        try:
+            return self.name2path_[name]
+        except KeyError:
+            return None
diff --git a/python/oneflow/compatible/single_client/framework/check_point_v2.py b/python/oneflow/compatible/single_client/framework/check_point_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1260d93c03c5fd8d53a70f151e677c5fb934a81c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/check_point_v2.py
@@ -0,0 +1,624 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal import EagerBlobTrait
+from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import boxing_util as boxing_util
+from oneflow.compatible.single_client.eager import op_infer_util as op_infer_util
+from oneflow.compatible.single_client.experimental import interface_op_read_and_write
+from oneflow.compatible.single_client.framework import config_util as config_util
+from oneflow.compatible.single_client.framework import dtype as dtype_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import runtime_mode as rt_mode
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.ops import get_variable as get_variable
+from oneflow.compatible.single_client.ops import initializer_util as initializer_util
+from oneflow.compatible.single_client.support import async_util as async_util
+from oneflow.core.framework import user_op_attr_pb2 as attr_value_pb
+from oneflow.core.framework import variable_meta_info_pb2 as variable_meta_info_pb
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_pb
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+META_INFO_FILENAME = "meta"
+DATA_FILENAME = "out"
+FAKE_JOB_NAME = "system_checkpoint"
+OP_PREFIX = "system_checkpoint"
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def sync_default_session_if_normal():
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        flow.sync_default_session()
+    else:
+        pass
+
+
+class FileBackendVariableBlob:
+    def __init__(
+        self,
+        var_dir: str,
+        dtype: Optional[flow.dtype] = None,
+        shape: Optional[Sequence[int]] = None,
+    ):
+        data_path = os.path.join(var_dir, DATA_FILENAME)
+        assert os.path.isfile(data_path)
+        self.var_dir_ = var_dir
+        meta_info_path = os.path.join(self.var_dir_, META_INFO_FILENAME)
+        if os.path.exists(meta_info_path):
+            meta_info = variable_meta_info_pb.VariableMetaInfo()
+            with open(meta_info_path) as f:
+                text_format.Parse(f.read(), meta_info)
+            self.has_meta_info_ = True
+        else:
+            self.has_meta_info_ = False
+        if self.has_meta_info_:
+            assert dtype is None and shape is None
+            self.shape_ = tuple(meta_info.shape.dim)
+            self.dtype_ = dtype_util.convert_proto_dtype_to_oneflow_dtype(
+                meta_info.data_type
+            )
+        elif shape is not None and dtype is not None:
+            self.shape_ = shape
+            self.dtype_ = dtype
+            self.has_meta_info_ = True
+        elif shape is not None or dtype is not None:
+            raise RuntimeError("both or neither of shape and dtype should be None")
+        else:
+            pass
+        if self.has_meta_info_:
+            itemsize = np.dtype(
+                dtype_util.convert_oneflow_dtype_to_numpy_dtype(self.dtype_)
+            ).itemsize
+            assert os.path.getsize(data_path) == np.prod(self.shape).item() * itemsize
+
+    @property
+    def file_path(self) -> str:
+        return os.path.join(self.var_dir_, DATA_FILENAME)
+
+    @property
+    def shape(self) -> Tuple[int]:
+        return self.shape_
+
+    @property
+    def quant_info(self):
+        raise NotImplementedError()
+
+    @property
+    def dtype(self) -> flow.dtype:
+        return self.dtype_
+
+    def numpy(self) -> np.ndarray:
+        if not self.has_meta_info_:
+            raise RuntimeError("This variable does not have meta info")
+        return np.fromfile(
+            self.file_path,
+            dtype=dtype_util.convert_oneflow_dtype_to_numpy_dtype(self.dtype),
+        ).reshape(self.shape)
+
+
+ValueContainer = Union[
+    EagerBlobTrait,
+    FileBackendVariableBlob,
+    np.ndarray,
+    "oneflow.compatible.single_client.Tensor",
+]
+
+
+def _ElemCnt(shape):
+    return np.prod(shape).astype(int).item()
+
+
+@session_ctx.try_init_default_session
+def GetAllVariables() -> Dict[str, oneflow._oneflow_internal.EagerConsistentBlob]:
+    """
+    Get all variables of all jobs as a dict.
+    """
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+    interface_ops = sess.interface_ops
+    variables = {}
+    for op in interface_ops:
+        op_attr = sess.OpAttribute4InterfaceOpName(op)
+        if op_attr.op_conf.WhichOneof("op_type") != "variable_conf":
+            continue
+        variables[op] = interface_op_read_and_write.GetEagerInterfaceBlob(op)
+    return variables
+
+
+def _LoadSingleVariable(path: str) -> Optional[FileBackendVariableBlob]:
+    if os.path.isfile(os.path.join(path, DATA_FILENAME)):
+        return FileBackendVariableBlob(path)
+    return None
+
+
+def _GetCheckpoint(
+    path: str,
+) -> Union[Dict[str, FileBackendVariableBlob], FileBackendVariableBlob]:
+    assert os.path.isdir(path), "Directory {} doesn't exist!".format(path)
+    single_var = _LoadSingleVariable(path)
+    if single_var is not None:
+        return single_var
+    var_dict = {}
+    for f in os.listdir(path):
+        var_dir = os.path.join(path, f)
+        var = _LoadSingleVariable(var_dir)
+        if var is not None:
+            var_dict[f] = var
+    return var_dict
+
+
+@session_ctx.try_init_default_session
+def GetCheckpoint(
+    path: str,
+) -> Union[Dict[str, FileBackendVariableBlob], FileBackendVariableBlob]:
+    """
+    Load variable(s) from file system.
+    """
+    return _GetCheckpoint(path)
+
+
+def Load(
+    path: str,
+) -> Union[Dict[str, FileBackendVariableBlob], FileBackendVariableBlob]:
+    return _GetCheckpoint(path)
+
+
+def _GetOpNameFromLbn(lbn):
+    return lbn.split("/")[0]
+
+
+def _GetScopeSymbolIdFromEagerBlob(blob):
+    name = _GetOpNameFromLbn(blob.logical_blob_name)
+    sess = session_ctx.GetDefaultSession()
+    op_conf = sess.OpAttribute4InterfaceOpName(name).op_conf
+    scope_symbol_id = op_conf.scope_symbol_id
+    return scope_symbol_id
+
+
+def _ReadSlice(
+    container: ValueContainer,
+) -> Iterable[Tuple[Sequence[int], Sequence[int], np.ndarray]]:
+    """
+    Return a generator which iterates over the input blob or array and yields
+    (start_nd_idx, stop_nd_idx, slice_np_array)
+    """
+    if isinstance(container, flow.Tensor):
+
+        def ReadFromTensor(tensor, start_nd_idx, stop_nd_idx):
+            start_nd_idx = list(map(int, start_nd_idx))
+            stop_nd_idx = list(map(int, stop_nd_idx))
+            return tensor[
+                tuple(
+                    [
+                        slice(start_nd_idx[i], stop_nd_idx[i])
+                        for i in range(len(start_nd_idx))
+                    ]
+                )
+            ].numpy()
+
+        yield from _ForEachSlice(container, ReadFromTensor)
+    elif isinstance(container, EagerBlobTrait):
+
+        def ReadFromEagerBlob(eager_blob, start_nd_idx, stop_nd_idx):
+            scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(eager_blob)
+            return _LogicalSlice(
+                eager_blob.blob_object, start_nd_idx, stop_nd_idx, scope_symbol_id
+            )
+
+        yield from _ForEachSlice(container, ReadFromEagerBlob)
+    elif isinstance(container, FileBackendVariableBlob):
+        np_dtype = np.dtype(
+            dtype_util.convert_oneflow_dtype_to_numpy_dtype(container.dtype)
+        )
+        with open(container.file_path, "rb") as f:
+
+            def ReadFromFile(_, start_nd_idx, stop_nd_idx):
+                length = _ElemCnt(np.array(stop_nd_idx) - np.array(start_nd_idx))
+                slice = f.read(length * np_dtype.itemsize)
+                return np.frombuffer(slice, dtype=np_dtype).reshape(
+                    np.array(stop_nd_idx) - np.array(start_nd_idx)
+                )
+
+            yield from _ForEachSlice(container, ReadFromFile)
+    elif isinstance(container, np.ndarray):
+
+        def ReadFromNpArray(array, start_nd_idx, stop_nd_idx):
+            slice_objs = []
+            for (start, stop) in zip(start_nd_idx, stop_nd_idx):
+                slice_objs.append(slice(start, stop))
+            return array[tuple(slice_objs)]
+
+        yield from _ForEachSlice(container, ReadFromNpArray)
+    else:
+        raise RuntimeError("Unknown type: {}".format(type(container).__name__))
+
+
+def _SaveVarDict(
+    path: str,
+    var_dict: Optional[
+        Dict[str, Union[FileBackendVariableBlob, EagerBlobTrait]]
+    ] = None,
+) -> None:
+    if var_dict is None:
+        var_dict = GetAllVariables()
+
+    def IsFileOrNonEmptyDir(path):
+        if os.path.isfile(path):
+            return True
+        if os.path.isdir(path) and len(os.listdir(path)) != 0:
+            return True
+        return False
+
+    assert not IsFileOrNonEmptyDir(
+        path
+    ), "{} is a file or non-empty directory! Note that flow.save is different from torch.save. It saves each weight as a separated file so that a directory instead of a file should be given.".format(
+        path
+    )
+    os.makedirs(path, exist_ok=True)
+    for (name, var) in var_dict.items():
+        meta_info = variable_meta_info_pb.VariableMetaInfo()
+        meta_info.shape.dim[:] = var.shape
+        meta_info.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            var.dtype
+        )
+        var_dir = os.path.join(path, name)
+        param_path = os.path.join(var_dir, DATA_FILENAME)
+        os.makedirs(os.path.dirname(param_path))
+        with open(param_path, "wb") as f:
+            for (_, _, slice) in _ReadSlice(var):
+                f.write(slice.tobytes())
+        with open(os.path.join(var_dir, META_INFO_FILENAME), "w") as f:
+            f.write(text_format.MessageToString(meta_info))
+    with open(os.path.join(path, "snapshot_done"), "w"):
+        pass
+
+
+@session_ctx.try_init_default_session
+def SaveVarDict(
+    path: str,
+    var_dict: Optional[
+        Dict[str, Union[FileBackendVariableBlob, EagerBlobTrait]]
+    ] = None,
+) -> None:
+    """
+    Save `var_dict` to `path`
+    """
+    sync_default_session_if_normal()
+    return _SaveVarDict(path, var_dict)
+
+
+def save(obj, save_dir):
+    return _SaveVarDict(save_dir, obj)
+
+
+def _LogicalSlice(
+    input_blob_object: oneflow._oneflow_internal.BlobObject,
+    start: Sequence[int],
+    stop: Sequence[int],
+    scope_symbol_id: int,
+) -> np.ndarray:
+    """
+    Construct a logical_slice op and run it by oneflow eager,
+    return the sliced result as a numpy ndarray
+    """
+    op_name = id_util.UniqueStr(OP_PREFIX)
+
+    def AsyncSlice(Yield):
+        def build(builder):
+            op_conf = op_conf_pb.OperatorConf()
+            device_tag = flow.current_scope().device_parallel_desc_symbol.device_tag
+            op_conf.device_tag = device_tag
+            op_conf.name = op_name
+            op_conf.user_conf.op_type_name = "logical_slice"
+            op_conf.user_conf.input["x"].s.append("{}/x_0".format(op_name))
+            op_conf.user_conf.output["y"].s.append("{}/y_0".format(op_name))
+            parallel_conf = input_blob_object.parallel_desc_symbol.parallel_conf
+            op_conf.user_conf.attr["parallel_conf"].at_string = str(parallel_conf)
+            op_conf.user_conf.attr["start"].at_list_int64.val[:] = start
+            op_conf.user_conf.attr["stop"].at_list_int64.val[:] = stop
+            op_conf.user_conf.attr["step"].at_list_int64.val[:] = [1] * len(start)
+            bn_in_op2blob_object = (
+                oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+            )
+            bn_in_op2blob_object["x_0"] = input_blob_object
+            op_attribute = op_infer_util.Infer(
+                op_conf, bn_in_op2blob_object, scope_symbol_id
+            )
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatelessCall(
+                cfg_op_attribute,
+                parallel_conf,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+            Yield(bn_in_op2blob_object["y_0"])
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+    lbi = lbi_util.LogicalBlobId()
+    lbi.set_op_name(op_name)
+    lbi.set_blob_name(op_name)
+    blob_object = async_util.Await(1, AsyncSlice)[0]
+    blob = oneflow._oneflow_internal.EagerConsistentBlob(
+        lbi,
+        blob_object=blob_object,
+        blob_register=blob_register,
+        job_name=FAKE_JOB_NAME,
+    )
+    return blob.numpy()
+
+
+def _GetCpu0VariableBlobFromNumpy(
+    np_array: np.ndarray, dtype: flow.dtype
+) -> oneflow._oneflow_internal.EagerConsistentBlob:
+    """
+    Add a variable on cpu 0, and feed the value of `np_array`
+
+    Note: dtype argument cannot be eliminated by
+    convert_numpy_dtype_to_oneflow_dtype(np_array.dtype),
+    because np.int8 == np.char and
+    numpy_dtype_to_oneflow_dtype(oneflow_dtype_to_numpy_dtype(flow.int8))
+    may be flow.char
+    """
+    with flow.scope.placement("cpu", "0:0"):
+        op_name = id_util.UniqueStr(OP_PREFIX)
+        op_conf = get_variable.GenerateVariableOpConf(
+            name=op_name,
+            shape=np_array.shape,
+            dtype=dtype,
+            initializer=initializer_util.zeros_initializer(dtype=dtype),
+            trainable=False,
+        )
+        current_parallel_desc_sym = flow.current_scope().device_parallel_desc_symbol
+        device_tag = current_parallel_desc_sym.device_tag
+        op_conf.device_tag = device_tag
+        op_attribute = op_infer_util.Infer(op_conf, {})
+        var_blob = get_variable.CreateEagerVariableBlob(
+            op_attribute, job_name=FAKE_JOB_NAME
+        )
+        interface_op_read_and_write.FeedValueToInterfaceBlobObject(
+            var_blob.blob_object, np_array
+        )
+        return var_blob
+
+
+def _LogicalSliceAssign(
+    ref_blob_object: oneflow._oneflow_internal.BlobObject,
+    value_blob_object: oneflow._oneflow_internal.BlobObject,
+    start: Sequence[int],
+    stop: Sequence[int],
+    scope_symbol_id: Optional[int],
+) -> None:
+    """
+    Construct a logical_slice_assign op and run it by oneflow eager
+    """
+
+    def BuildAssignInstruction(builder):
+        op_conf = op_conf_pb.OperatorConf()
+        device_tag = flow.current_scope().device_parallel_desc_symbol.device_tag
+        op_conf.device_tag = device_tag
+        op_name = id_util.UniqueStr(OP_PREFIX)
+        op_conf.name = op_name
+        op_conf.user_conf.op_type_name = "logical_slice_assign"
+        op_conf.user_conf.input["value"].s.append("{}/value_0".format(op_name))
+        op_conf.user_conf.input["ref"].s.append("{}/ref_0".format(op_name))
+        parallel_conf = ref_blob_object.parallel_desc_symbol.parallel_conf
+        op_conf.user_conf.attr["parallel_conf"].at_string = str(parallel_conf)
+        op_conf.user_conf.attr["start"].at_list_int64.val[:] = start
+        op_conf.user_conf.attr["stop"].at_list_int64.val[:] = stop
+        op_conf.user_conf.attr["step"].at_list_int64.val[:] = [1] * len(start)
+        bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+        bn_in_op2blob_object["ref_0"] = ref_blob_object
+        bn_in_op2blob_object["value_0"] = value_blob_object
+        op_attribute = op_infer_util.Infer(
+            op_conf, bn_in_op2blob_object, scope_symbol_id
+        )
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildAssignInstruction)
+
+
+def FeedValueToVariable(
+    var_blob: Union[
+        oneflow._oneflow_internal.EagerConsistentBlob,
+        "oneflow.compatible.single_client.Tensor",
+    ],
+    value: ValueContainer,
+    scope_symbol_id: Optional[int],
+) -> None:
+    """
+    Feed the value of `value` to the variable `var_blob`
+    """
+    assert isinstance(
+        value, (EagerBlobTrait, FileBackendVariableBlob, np.ndarray, flow.Tensor)
+    ), "Unknown value type: {}".format(type(value).__name__)
+    if isinstance(value, FileBackendVariableBlob):
+        if not value.has_meta_info_:
+            value = FileBackendVariableBlob(
+                value.var_dir_, var_blob.dtype, var_blob.shape
+            )
+    assert var_blob.shape == value.shape, "{} vs {}".format(var_blob.shape, value.shape)
+    if isinstance(value, np.ndarray):
+        value_flow_dtype = dtype_util.convert_numpy_dtype_to_oneflow_dtype(value.dtype)
+    else:
+        value_flow_dtype = value.dtype
+    assert var_blob.dtype == value_flow_dtype, "{} vs {}".format(
+        var_blob.dtype, value_flow_dtype
+    )
+    if isinstance(var_blob, flow.Tensor):
+        raise ValueError("Tensor object arguments are not supported")
+    else:
+        assert isinstance(var_blob, EagerBlobTrait)
+        var_blob_object = var_blob.blob_object
+    for (start, stop, slice) in _ReadSlice(value):
+        slice_value_blob = _GetCpu0VariableBlobFromNumpy(slice, var_blob.dtype)
+        _LogicalSliceAssign(
+            var_blob_object, slice_value_blob.blob_object, start, stop, scope_symbol_id
+        )
+
+
+@session_ctx.try_init_default_session
+def LoadVariables(value_dict: Dict[str, ValueContainer], ignore_mismatch: bool = True):
+    """
+    Load value in `value_dict` into oneflow variables.
+    For example, if `value_dict` is {'x', np.ones(x_shape)},
+    the value of variable "x" will all ones.
+    If `ignore_mismatch` is False, an exception will be raised when
+    there is a name in `value_dict` not belonging to any variable.
+    """
+    sync_default_session_if_normal()
+    all_vars = GetAllVariables()
+    for (name, value) in value_dict.items():
+        if name in all_vars:
+            var_blob = interface_op_read_and_write.GetEagerInterfaceBlob(name)
+            scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(var_blob)
+            FeedValueToVariable(var_blob, value, scope_symbol_id)
+        elif not ignore_mismatch:
+            raise RuntimeError('"{}" is not a variable name'.format(name))
+    oneflow._oneflow_internal.eager.single_client.Sync()
+
+
+def _ForEachSlice(
+    container: ValueContainer,
+    f: Union[
+        Callable[[EagerBlobTrait, Sequence[int], Sequence[int]], Any],
+        Callable[[FileBackendVariableBlob, Sequence[int], Sequence[int]], Any],
+        Callable[[np.ndarray, Sequence[int], Sequence[int]], Any],
+    ],
+):
+    """
+    Slice container into slices whose size < SLICE_BYTES. For every slice,
+    yield start_nd_idx, stop_nd_idx and f(slice)
+    """
+    assert isinstance(
+        container, (EagerBlobTrait, FileBackendVariableBlob, np.ndarray, flow.Tensor)
+    ), "Unknown type: {}".format(type(container).__name__)
+    assert container.shape is not None
+    SLICE_BYTES = 32 * 1024 * 1024
+    if isinstance(container, np.ndarray):
+        np_dtype = container.dtype
+    else:
+        np_dtype = np.dtype(
+            dtype_util.convert_oneflow_dtype_to_numpy_dtype(container.dtype)
+        )
+    SLICE_LEN = SLICE_BYTES // np_dtype.itemsize
+    start_idx = 0
+    size = _ElemCnt(container.shape)
+    cnt = 1
+    for axis in reversed(range(len(container.shape))):
+        cnt *= container.shape[axis]
+        if cnt > SLICE_LEN:
+            break
+    unit_size = _ElemCnt(tuple(container.shape)[axis + 1 :])
+    max_unit_num = SLICE_LEN // unit_size
+    while start_idx < size:
+        remainder = container.shape[axis]
+        while remainder > 0:
+            unit_num = max_unit_num if remainder >= max_unit_num else remainder
+            length = unit_num * unit_size
+            remainder -= unit_num
+            stop_idx = start_idx + length
+            start_nd_idx = np.unravel_index(start_idx, container.shape)
+            stop_nd_idx = np.unravel_index(stop_idx - 1, container.shape)
+            stop_nd_idx = tuple([x + 1 for x in stop_nd_idx])
+            yield (start_nd_idx, stop_nd_idx, f(container, start_nd_idx, stop_nd_idx))
+            start_idx = stop_idx
+
+
+def generate_values_by_initializer(initializer, shape, dtype):
+    np_dtype = np.dtype(dtype_util.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    length = _ElemCnt(shape)
+    return np.array(initializer(length)).astype(np_dtype).reshape(shape)
+
+
+def init_by_initializer_conf(
+    var_blob: Union[EagerBlobTrait, "oneflow.compatible.single_client.Tensor"],
+    initializer_conf: initializer_conf_util.InitializerConf,
+    sync_between_multi_machine: bool,
+    scope_symbol_id: Optional[int],
+    random_seed: int = 0,
+):
+    initializer = initializer_util.GetInitializer(
+        initializer_conf, random_seed, var_blob.shape
+    )
+    if initializer is None:
+        return
+
+    def GenerateValueAndAssign(var_blob, start_nd_idx, stop_nd_idx):
+        shape = np.array(stop_nd_idx) - np.array(start_nd_idx)
+        vals = generate_values_by_initializer(initializer, shape, var_blob.dtype)
+        if isinstance(var_blob, flow.Tensor):
+            raise ValueError("Tensor object arguments are not supported")
+        else:
+            assert isinstance(var_blob, EagerBlobTrait)
+            var_blob_object = var_blob.blob_object
+        slice_value_blob = _GetCpu0VariableBlobFromNumpy(vals, var_blob.dtype)
+        _LogicalSliceAssign(
+            var_blob_object,
+            slice_value_blob.blob_object,
+            start_nd_idx,
+            stop_nd_idx,
+            scope_symbol_id,
+        )
+
+    for _ in _ForEachSlice(var_blob, GenerateValueAndAssign):
+        pass
+    if sync_between_multi_machine:
+        oneflow._oneflow_internal.eager.single_client.Sync()
+
+
+def Init() -> None:
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+    for (op_name, var_blob) in GetAllVariables().items():
+        var_conf = sess.OpAttribute4InterfaceOpName(op_name).op_conf.variable_conf
+        if not (
+            var_conf.HasField("initializer")
+            or var_conf.HasField("initialize_with_snapshot")
+        ):
+            continue
+        if var_conf.HasField("initialize_with_snapshot"):
+            initialize_with_snapshot_conf = var_conf.initialize_with_snapshot
+            if initialize_with_snapshot_conf.HasField("key"):
+                snapshot_key = op_name
+            else:
+                snapshot_key = initialize_with_snapshot_conf.key
+            var_dir = os.path.dirname(
+                os.path.join(initialize_with_snapshot_conf.path, snapshot_key)
+            )
+            LoadVariables({op_name: GetCheckpoint(var_dir)})
+            continue
+        scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(var_blob)
+        init_by_initializer_conf(
+            var_blob, var_conf.initializer, False, scope_symbol_id, var_conf.random_seed
+        )
+    oneflow._oneflow_internal.eager.single_client.Sync()
diff --git a/python/oneflow/compatible/single_client/framework/compile_context.py b/python/oneflow/compatible/single_client/framework/compile_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..87c852cb34ec4b6f6c2e4f9f6e987d8344b6f8af
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/compile_context.py
@@ -0,0 +1,86 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import (
+    distribute_context as distribute_ctx,
+)
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_context,
+)
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+def GetCurJobConfigProto():
+    return enable_if.unique([GetEagerCurJobConfigProto, GetLazyCurJobConfigProto])()
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def GetEagerCurJobConfigProto():
+    function_desc = session_ctx.GetDefaultSession().CurrentEagerGlobalFunctionDesc()
+    assert function_desc is not None
+    return function_desc.job_config_proto
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def GetLazyCurJobConfigProto():
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    function_desc = session_ctx.GetDefaultSession().GetLazyFunctionDesc(job_name)
+    assert function_desc is not None
+    return function_desc.job_config_proto
+
+
+logged_op_confs = set({})
+
+
+def CurJobAddOp(op_conf, scope_symbol=None):
+    if distribute_ctx.IsMirroredStrategyEnabled():
+        return CurJobAddMirroredOp(op_conf, scope_symbol)
+    return CurJobAddConsistentOp(op_conf, scope_symbol)
+
+
+def CurJobAddConsistentOp(op_conf, scope_symbol=None):
+    if scope_symbol is None:
+        scope_symbol = flow.current_scope()
+    op_conf.scope_symbol_id = scope_symbol.symbol_id
+    if not op_conf.HasField("device_tag"):
+        device_tag = scope_symbol.device_parallel_desc_symbol.device_tag
+        op_conf.device_tag = device_tag
+    op_attr = c_api_util.CurJobBuildAndInferCtx_AddAndInferConsistentOp(op_conf)
+    if c_api_util.IsInterfaceOpConf(op_conf):
+        sess = session_ctx.GetDefaultSession()
+        sess.AddInfo4InterfaceOpName(op_conf.name, op_attr)
+    return op_attr
+
+
+def CurJobAddMirroredOp(op_conf, scope_symbol=None):
+    assert not hob.consistent_view_enabled(None)
+    if scope_symbol is None:
+        scope_symbol = flow.current_scope()
+    op_conf.scope_symbol_id = scope_symbol.symbol_id
+    if not op_conf.HasField("device_tag"):
+        device_tag = scope_symbol.device_parallel_desc_symbol.device_tag
+        op_conf.device_tag = device_tag
+    op_attr = c_api_util.CurJobBuildAndInferCtx_AddAndInferMirroredOp(op_conf)
+    if c_api_util.IsInterfaceOpConf(op_conf):
+        sess = session_ctx.GetDefaultSession()
+        sess.AddInfo4InterfaceOpName(op_conf.name, op_attr)
+    return op_attr
diff --git a/python/oneflow/compatible/single_client/framework/compiler.py b/python/oneflow/compatible/single_client/framework/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..70d8935ccf717abb2c8b0ed446ffca479a5af2df
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/compiler.py
@@ -0,0 +1,224 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import typing
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import ops as ops
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_util
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_ctx,
+)
+from oneflow.compatible.single_client.framework import placement_util as placement_util
+from oneflow.compatible.single_client.framework import push_util as push_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import runtime_mode as runtime_mode
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.framework import typing as oft
+from oneflow.compatible.single_client.framework import typing_util as oft_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.compatible.single_client.support import (
+    func_inspect_util as func_inspect_util,
+)
+
+
+def Compile(session, function_desc, config_proto):
+    with InterpretScope(session, function_desc, config_proto):
+        _CompileJob(session, function_desc)
+        session.StashJob(function_desc.job_func.__name__)
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
+        session.StashJob(
+            function_desc.job_func.__name__,
+            function_desc.job_func.__name__ + "_after_complete",
+        )
+
+
+def EagerRun(session, function_desc, config_proto, args):
+    with InterpretScope(session, function_desc, config_proto):
+        ret = _InterpretGlobalFunction(function_desc, args)
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
+        session_ctx.GetDefaultSession().UpdateInfo4InterfaceOp()
+    return ret
+
+
+@contextmanager
+def InterpretScope(session, function_desc, config_proto):
+    job_conf = function_desc.job_config_proto
+    job_conf.set_job_name(function_desc.job_func.__name__)
+    placement_scope = function_desc.function_attribute.default_placement_scope
+    if placement_scope is None:
+        tag_and_dev_ids = placement_util.GetDefaultMachineDeviceIds(session.resource)
+        hierarchy = None
+    else:
+        assert isinstance(placement_scope, placement_ctx.EmptyPlacementScope)
+        tag_and_dev_ids = (
+            placement_scope.device_tag,
+            placement_scope.machine_device_ids,
+        )
+        hierarchy = placement_scope.hierarchy
+    distribute_strategy = function_desc.function_attribute.default_distribute_strategy
+    if distribute_strategy is None:
+        distribute_strategy = distribute_util.DistributeConsistentStrategy()
+    is_mirrored = isinstance(
+        distribute_strategy, distribute_util.DistributeMirroredStrategy
+    )
+    assert isinstance(hierarchy, (list, tuple)) or hierarchy is None
+    if hierarchy is not None:
+        hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
+    scope = scope_util.MakeInitialScope(
+        job_conf, *tag_and_dev_ids, hierarchy, is_mirrored
+    )
+    with _JobBuildAndInferCtx(job_conf.job_name()), distribute_strategy:
+        c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+        with runtime_mode.ModeScope(runtime_mode.GLOBAL_MODE):
+            with scope_util.ScopeContext(scope):
+                yield
+
+
+def _CompileJob(session, function_desc):
+    func = function_desc.job_func
+    parameters = func.__oneflow_function_signature__.parameters
+    if len(parameters) == 0:
+        func.__oneflow_input_blob_defs__ = ()
+    elif all((p.annotation is inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _GetArgDefault(func)
+    elif all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _MakeInputBlobDefFromParameterSignature(
+            parameters
+        )
+    else:
+        raise NotImplementedError(
+            "All parameters of global function should be annotated"
+        )
+    inputs = _RecursiveMakeInputBlobs(func.__oneflow_input_blob_defs__)
+    ret = func(*inputs)
+    return_annotation = func.__oneflow_function_signature__.return_annotation
+    oft_util.CheckReturnByAnnotation(func.__name__, ret, return_annotation)
+    func.__oneflow_output_remote_blobs__ = _RecursiveMakeRetRemoteBlobs(
+        ret, allow_cpu_return_op=function_desc.function_attribute.allow_cpu_return_op
+    )
+
+
+def _InterpretGlobalFunction(function_desc, args):
+    func = function_desc.job_func
+    parameters = func.__oneflow_function_signature__.parameters
+    if len(parameters) == 0:
+        func.__oneflow_input_blob_defs__ = ()
+    elif all((p.annotation is inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _GetArgDefault(func)
+    elif all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _MakeInputBlobDefFromParameterSignature(
+            parameters
+        )
+    else:
+        raise NotImplementedError(
+            "All parameters of global function should be annotated"
+        )
+    inputs = push_util.MakeEagerInputBlobs(func.__oneflow_input_blob_defs__, args)
+    ret = func(*inputs)
+    return_annotation = func.__oneflow_function_signature__.return_annotation
+    oft_util.CheckReturnByAnnotation(func.__name__, ret, return_annotation)
+    return _RecursiveMakeRetRemoteBlobs(
+        ret, allow_cpu_return_op=function_desc.function_attribute.allow_cpu_return_op
+    )
+
+
+@contextmanager
+def _JobBuildAndInferCtx(job_name):
+    c_api_util.JobBuildAndInferCtx_Open(job_name)
+    try:
+        yield
+    finally:
+        oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+
+def _GetArgDefault(func):
+    if hasattr(func, "__oneflow_arg_default__"):
+        return func.__oneflow_arg_default__
+    return _CloneArgBlobDef(func_inspect_util.GetArgDefaults(func))
+
+
+def _CloneArgBlobDef(args):
+    if isinstance(args, input_blob_util.ArgBlobDef):
+        return args.Clone()
+    if isinstance(args, (tuple, list)):
+        return type(args)((_CloneArgBlobDef(x) for x in args))
+    if isinstance(args, dict):
+        return {k: _CloneArgBlobDef(v) for (k, v) in args}
+    raise NotImplementedError(
+        "oneflow.compatible.single_client.global_function only accepts nested input blob defs"
+    )
+
+
+def _RecursiveMakeInputBlobs(input_blob_def):
+    if isinstance(input_blob_def, input_blob_util.ArgBlobDef):
+        return ops.InputOpByArgBlobDef(input_blob_def)
+    if isinstance(input_blob_def, (tuple, list)):
+        return type(input_blob_def)(
+            (_RecursiveMakeInputBlobs(x) for x in input_blob_def)
+        )
+    if isinstance(input_blob_def, dict):
+        return {k: _RecursiveMakeInputBlobs(v) for (k, v) in input_blob_def.items()}
+    raise NotImplementedError(
+        "oneflow.compatible.single_client.global_function accepts "
+        + "ArgBlobDefs or list/tuple/dict nested ArgBlobDefs as argument"
+    )
+
+
+def _MakeInputBlobDefFromParameterSignature(parameters):
+    def CheckAndRecusiveMake(p):
+        return _RecusiveMakeInputBlobDef(p.annotation)
+
+    return tuple((CheckAndRecusiveMake(p) for (_, p) in parameters.items()))
+
+
+def _RecusiveMakeInputBlobDef(cls):
+    if oft.OriginFrom(cls, oft.OneflowNumpyDef):
+        return cls.NewInputBlobDef()
+    elif oft.OriginFrom(cls, typing.Tuple):
+        return tuple((_RecusiveMakeInputBlobDef(a) for a in cls.__args__))
+    else:
+        raise NotImplementedError(
+            "\nannotation %s" % cls
+            + "not supported"
+            + "\nonly support oneflow.compatible.single_client.typing.Numpy.Placeholder, oneflow.compatible.single_client.typing.ListNumpy.Placeholder"
+        )
+
+
+def _RecursiveMakeRetRemoteBlobs(remote_blobs, **kwarg):
+    if remote_blobs is None:
+        return None
+    if isinstance(remote_blobs, oneflow._oneflow_internal.BlobDesc):
+        return ops.ReturnRemoteBlob(remote_blobs, **kwarg)
+    if isinstance(remote_blobs, (tuple, list)):
+        return type(remote_blobs)(
+            (_RecursiveMakeRetRemoteBlobs(x, **kwarg) for x in remote_blobs)
+        )
+    if isinstance(remote_blobs, dict):
+        return {
+            k: _RecursiveMakeRetRemoteBlobs(v, **kwarg)
+            for (k, v) in remote_blobs.items()
+        }
+    raise NotImplementedError(
+        "oneflow.compatible.single_client.global_function returns "
+        + "RemoteBlob or list/tuple/dict nested RemoteBlob only"
+    )
diff --git a/python/oneflow/compatible/single_client/framework/config_util.py b/python/oneflow/compatible/single_client/framework/config_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eea82d2d2f075d7fe07bad8a363bc85d52a0072
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/config_util.py
@@ -0,0 +1,615 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+def api_load_library(val: str) -> None:
+    """Load necessary library for job
+
+    Args:
+        val (str): path to shared object file
+    """
+    return enable_if.unique([load_library, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def load_library(val):
+    assert type(val) is str
+    sess = session_ctx.GetDefaultSession()
+    sess.config_proto.load_lib_path.append(val)
+
+
+def api_load_library_now(val: str) -> None:
+    """Load necessary library for job now
+
+    Args:
+        val (str): path to shared object file
+    """
+    return enable_if.unique([load_library_now, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def load_library_now(val):
+    assert type(val) is str
+    oneflow._oneflow_internal.LoadLibraryNow(val)
+
+
+def api_machine_num(val: int) -> None:
+    """Set available number of machine/node for  running job .
+
+    Args:
+        val (int): available number of machines
+    """
+    return enable_if.unique([machine_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def machine_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.machine_num = val
+
+
+def api_gpu_device_num(val: int) -> None:
+    """Set number of GPUs on each machine to run oneflow on.
+
+    Args:
+        val (int): number of GPUs. It is identical on every machine. In other words,
+        you can't specify different number of GPUs you would like to use on each machine.
+    """
+    if oneflow._oneflow_internal.flags.with_cuda():
+        return enable_if.unique([gpu_device_num, do_nothing])(val)
+    else:
+        print(
+            "INFO: for CPU-only OneFlow, oneflow.compatible.single_client.config.gpu_device_num is equivalent to oneflow.compatible.single_client.config.cpu_device_num"
+        )
+        print(traceback.format_stack()[-2])
+        return enable_if.unique([cpu_device_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def gpu_device_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.gpu_device_num = val
+
+
+def api_cpu_device_num(val: int) -> None:
+    """Set number of CPUs on each machine to run oneflow on. Usually you don't need to set this.
+
+    Args:
+        val (int): number of CPUs. It is identical on every machine.
+    """
+    return enable_if.unique([cpu_device_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def cpu_device_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.cpu_device_num = val
+
+
+def api_comm_net_worker_num(val: int) -> None:
+    """Set up the workers number in epoll  mode network,
+            If use RDMA mode network, then doesn't need.
+
+    Args:
+        val (int): number of workers
+    """
+    return enable_if.unique([comm_net_worker_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def comm_net_worker_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.comm_net_worker_num = val
+
+
+def api_max_mdsave_worker_num(val: int) -> None:
+    """Set up max number of workers for mdsave process.
+
+    Args:
+        val (int):  max number of workers
+    """
+    return enable_if.unique([max_mdsave_worker_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def max_mdsave_worker_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.max_mdsave_worker_num = val
+
+
+def api_numa_aware_cuda_malloc_host(val: bool = True) -> None:
+    """Whether or not let numa know  that  cuda allocated host's memory.
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    print(
+        "'enable_numa_aware_cuda_malloc_host' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+def api_compute_thread_pool_size(val: int) -> None:
+    """Set up the size of compute thread pool
+
+    Args:
+        val (int): size of  thread pool
+    """
+    return enable_if.unique([compute_thread_pool_size, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def compute_thread_pool_size(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.compute_thread_pool_size = val
+
+
+def api_rdma_mem_block_mbyte(val: int) -> None:
+    """Set up the memory block size in rdma mode.
+
+    Args:
+        val (int): size of block, e.g. 1024(mb)
+    """
+    return enable_if.unique([rdma_mem_block_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def rdma_mem_block_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.rdma_mem_block_mbyte = val
+
+
+def api_rdma_recv_msg_buf_mbyte(val: int) -> None:
+    """Set up the buffer size for receiving messages in rama mode
+
+    Args:
+        val (int): buffer size, e.g. 1024(mb)
+    """
+    return enable_if.unique([rdma_recv_msg_buf_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def rdma_recv_msg_buf_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.rdma_recv_msg_buf_mbyte = val
+
+
+def api_reserved_host_mem_mbyte(val: int) -> None:
+    """Set up the memory size of reserved host
+
+    Args:
+        val (int):  memory size, e.g. 1024(mb)
+    """
+    return enable_if.unique([reserved_host_mem_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def reserved_host_mem_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.reserved_host_mem_mbyte = val
+
+
+def api_reserved_device_mem_mbyte(val: int) -> None:
+    """Set up the memory size of reserved device
+
+    Args:
+        val (int):  memory size, e.g. 1024(mb)
+    """
+    return enable_if.unique([reserved_device_mem_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def reserved_device_mem_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.reserved_device_mem_mbyte = val
+
+
+def api_use_rdma(val: bool = True) -> None:
+    """Whether use RDMA to speed up data transmission in cluster nodes or not.
+          if not, then use normal epoll mode.
+
+    Args:
+        val (bool, optional):  Defaults to True.
+    """
+    return enable_if.unique([use_rdma, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def use_rdma(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.use_rdma = val
+
+
+def api_thread_enable_local_message_queue(val: bool) -> None:
+    """Whether or not enable thread using local  message queue.
+
+    Args:
+        val (bool):  True or False
+    """
+    return enable_if.unique([thread_enable_local_message_queue, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def thread_enable_local_message_queue(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.thread_enable_local_message_queue = val
+
+
+def api_enable_debug_mode(val: bool) -> None:
+    """Whether use debug mode or not.
+
+    Args:
+        val (bool):  True or False
+    """
+    return enable_if.unique([enable_debug_mode, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_debug_mode(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_debug_mode = val
+
+
+def api_legacy_model_io_enabled():
+    sess = session_ctx.GetDefaultSession()
+    return sess.config_proto.resource.enable_legacy_model_io
+
+
+def api_enable_legacy_model_io(val: bool = True):
+    """Whether or not use legacy model io.
+
+    Args:
+        val ([type]): True or False
+    """
+    return enable_if.unique([enable_legacy_model_io, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_legacy_model_io(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_legacy_model_io = val
+
+
+def api_enable_model_io_v2(val):
+    """Whether or not use version2  of model input/output function.
+
+    Args:
+        val ([type]): True or False
+    """
+    return enable_if.unique([enable_model_io_v2, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_model_io_v2(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_model_io_v2 = val
+
+
+def api_collect_act_event(val: bool = True) -> None:
+    """Whether or not collect active event.
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([collect_act_event, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def collect_act_event(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.profile_conf.collect_act_event = val
+
+
+def api_enable_fusion(val: bool = True) -> None:
+    """Whether or not allow fusion the operators
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([enable_fusion, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_fusion(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.enable_fusion = val
+
+
+def api_num_callback_threads(val: int) -> None:
+    """Set up number of callback threads for boxing process.
+            Boxing is used to convert between different parallel properties of logical tensor
+
+    Args:
+        val (int): number of  callback threads
+    """
+    return enable_if.unique([num_callback_threads, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def num_callback_threads(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.num_callback_threads = val
+
+
+def api_enable_tensor_float_32_compute(val: bool = True) -> None:
+    """Whether or not to enable Tensor-float-32 on supported GPUs
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([enable_tensor_float_32_compute, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_tensor_float_32_compute(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_tensor_float_32_compute = val
+
+
+def api_enable_mem_chain_merge(val: bool = True) -> None:
+    """Whether or not to enable MemChain merge.
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([enable_mem_chain_merge, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_mem_chain_merge(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_mem_chain_merge = val
+
+
+def api_nccl_use_compute_stream(val: bool = False) -> None:
+    """Whether or not nccl use compute stream to reuse nccl memory and speedup
+
+    Args:
+        val (bool, optional): True or False. Defaults to False.
+    """
+    return enable_if.unique([nccl_use_compute_stream, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_use_compute_stream(val=False):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.nccl_use_compute_stream = val
+
+
+def api_disable_group_boxing_by_dst_parallel(val: bool = False) -> None:
+    """Whether or not disable group boxing by dst parallel pass to reduce boxing memory life cycle.
+
+    Args:
+        val (bool, optional): True or False. Defaults to False.
+    """
+    return enable_if.unique([disable_group_boxing_by_dst_parallel, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def disable_group_boxing_by_dst_parallel(val=False):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.disable_group_boxing_by_dst_parallel = val
+
+
+def api_nccl_num_streams(val: int) -> None:
+    """Set up the number of nccl parallel streams while use boxing
+
+    Args:
+        val (int): number of streams
+    """
+    return enable_if.unique([nccl_num_streams, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_num_streams(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.nccl_num_streams = val
+
+
+def api_nccl_fusion_threshold_mb(val: int) -> None:
+    """Set up threshold for oprators fusion
+
+    Args:
+        val (int): int number, e.g. 10(mb)
+    """
+    return enable_if.unique([nccl_fusion_threshold_mb, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_threshold_mb(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_threshold_mb = val
+
+
+def api_nccl_fusion_all_reduce_use_buffer(val: bool) -> None:
+    """Whether or not use buffer during nccl fusion progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_all_reduce_use_buffer, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_all_reduce_use_buffer(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_all_reduce_use_buffer = (
+        val
+    )
+
+
+def api_nccl_fusion_all_reduce(val: bool) -> None:
+    """Whether or not use nccl fusion during all reduce progress
+
+    Args:
+        val (bool):  True or False
+    """
+    return enable_if.unique([nccl_fusion_all_reduce, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_all_reduce(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_all_reduce = val
+
+
+def api_nccl_fusion_reduce_scatter(val: bool) -> None:
+    """Whether or not  use nccl fusion during reduce scatter progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_reduce_scatter, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_reduce_scatter(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_reduce_scatter = val
+
+
+def api_nccl_fusion_all_gather(val: bool) -> None:
+    """Whether or not use nccl fusion during all  gather progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_all_gather, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_all_gather(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_all_gather = val
+
+
+def api_nccl_fusion_reduce(val: bool) -> None:
+    """Whether or not use nccl fusion during reduce progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_reduce, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_reduce(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_reduce = val
+
+
+def api_nccl_fusion_broadcast(val: bool) -> None:
+    """Whether or not use nccl fusion during broadcast progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_broadcast, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_broadcast(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_broadcast = val
+
+
+def api_nccl_fusion_max_ops(val: int) -> None:
+    """Maximum number of ops for nccl fusion.
+
+    Args:
+        val (int): Maximum number of ops
+    """
+    return enable_if.unique([nccl_fusion_max_ops, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_max_ops(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_max_ops = val
+
+
+def api_nccl_enable_all_to_all(val: bool) -> None:
+    """Whether or not use nccl all2all during s2s boxing
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_enable_all_to_all, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_enable_all_to_all(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_enable_all_to_all = val
+
+
+def api_nccl_enable_mixed_fusion(val: bool) -> None:
+    """Whether or not use nccl mixed fusion
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_enable_mixed_fusion, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_enable_mixed_fusion(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_enable_mixed_fusion = val
+
+
+@enable_if.condition(hob.in_normal_mode & hob.session_initialized)
+def do_nothing(*args, **kwargs):
+    print("Nothing happened because the session is running")
+    return False
diff --git a/python/oneflow/compatible/single_client/framework/distribute.py b/python/oneflow/compatible/single_client/framework/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..892aa39dca50dfac1b2809f0132083e5643f9e92
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/distribute.py
@@ -0,0 +1,228 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow.compatible.single_client.framework import (
+    distribute_context as distribute_ctx,
+)
+
+
+@oneflow_deprecate()
+def deprecated_mirrored_strategy():
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.distribute.mirrored_strategy",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.compatible.single_client.scope.mirrored_view"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return DistributeMirroredStrategy()
+
+
+class DistributeMirroredStrategy(distribute_ctx.DistributeStrategy):
+    """Create a scope in mirrored view. All operators within the scope will be mirrored among diffierent accelerators.
+    Usage::
+
+        with oneflow.compatible.single_client.scope.mirrored_view():
+            ...
+
+    """
+
+    def __init__(self):
+        distribute_ctx.DistributeStrategy.__init__(self, True)
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_mirrored_strategy_enabled():
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.distribute.mirrored_strategy_enabled",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.compatible.single_client.scope.mirrored_view_enabled"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return MirroredStrategyEnabled()
+
+
+def MirroredStrategyEnabled() -> bool:
+    """
+
+    Returns:
+        bool: `True` if mirrored strategy is enabled in current context where this function is called.
+
+    """
+    return distribute_ctx.IsMirroredStrategyEnabled()
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_consistent_strategy():
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.distribute.consistent_strategy",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.compatible.single_client.scope.consistent_view"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return DistributeConsistentStrategy()
+
+
+class DistributeConsistentStrategy(distribute_ctx.DistributeStrategy):
+    """Create a scope in consistent view. All operators within the scope will be automatically parallelized among diffierent accelerators for best performance and least data transfer.
+
+    Usage::
+
+        with oneflow.compatible.single_client.scope.consistent_view():
+            ...
+
+    """
+
+    def __init__(self):
+        distribute_ctx.DistributeStrategy.__init__(self, False)
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_consistent_strategy_enabled():
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.distribute.consistent_strategy_enabled",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.compatible.single_client.scope.consistent_view_enabled"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return ConsistentStrategyEnabled()
+
+
+def ConsistentStrategyEnabled() -> bool:
+    """
+
+    Returns:
+        bool: `True` if consistent strategy is enabled in current context where this function is called.
+
+    """
+    return distribute_ctx.IsConsistentStrategyEnabled()
+
+
+def split(axis: int) -> oneflow._oneflow_internal.distribute.SplitDistribute:
+    """Generate a split scheme in which op will be splitted at `axis`.
+
+    Args:
+        axis (int): At `axis` the op will be splitted.
+
+    Returns:
+        SplitDistribute: Split scheme object, often required by `with_distribute` method of `Blob` or `oneflow.compatible.single_client.get_variable`.
+
+    Example::
+        weight = weight.with_distribute(distribute.split(1))
+
+    """
+    assert type(axis) is int
+    return oneflow._oneflow_internal.distribute.split(axis)
+
+
+def broadcast() -> oneflow._oneflow_internal.distribute.BroadcastDistribute:
+    """Generate a broadcast scheme.
+
+    Returns:
+        BroadcastDistribute: Broadcast scheme object, often required by `with_distribute` method of `Blob` or `oneflow.compatible.single_client.get_variable`.
+
+    Example::
+        segment_ids = segment_ids.with_distribute(flow.distribute.broadcast())
+
+    """
+    return oneflow._oneflow_internal.distribute.broadcast()
+
+
+def auto() -> oneflow._oneflow_internal.distribute.AutoDistribute:
+    """Generate a broadcast scheme.
+
+    Returns:
+        AutoDistribute: Auto distribute scheme object, often required by `with_distribute` method of `Blob` or `oneflow.compatible.single_client.get_variable`.
+
+    """
+    return oneflow._oneflow_internal.distribute.auto()
+
+
+def assert_is_valid_distribute(
+    distribute: oneflow._oneflow_internal.distribute.Distribute,
+) -> None:
+    assert isinstance(
+        distribute, oneflow._oneflow_internal.distribute.Distribute
+    ), "not a valid distribute policy.\n           expected: 1) oneflow.compatible.single_client.distribute.split(axis); 2) oneflow.compatible.single_client.distribute.broadcast(); 3) oneflow.compatible.single_client.distribute.auto()"
+
+
+def get_local_rank():
+    return oneflow._oneflow_internal.GetLocalRank()
+
+
+def get_rank():
+    """Returns the rank of current process group.
+
+    Returns:
+        The rank of the process group.
+
+    """
+    return oneflow._oneflow_internal.GetRank()
+
+
+def get_world_size():
+    """Returns the number of processes in the current process group.
+
+    Returns:
+        The world size of the process group.
+
+    """
+    return oneflow._oneflow_internal.GetWorldSize()
+
+
+def is_multi_client():
+    return oneflow._oneflow_internal.IsMultiClient()
+
+
+def split_sbp(
+    axis: int,
+) -> oneflow._oneflow_internal.oneflow.core.job.sbp_parallel.SbpParallel:
+    """Generate a split scheme in which op will be splitted at `axis`.
+
+    Args:
+        axis (int): At `axis` the op will be splitted.
+
+    Returns:
+        SbpParallel: Split scheme object, often required by `to_consistent` method of `Tensor`
+
+    Example::
+        array = numpy.array([[1.0, 2.0], [3.0, 4.0]])
+        t1 = flow.tensor(array)
+        ct2 = t1.to_consistent(sbp=flow.sbp.split(0), placement=("cuda", {0: [0, 1, 2, 3]}))
+
+    """
+    assert type(axis) is int
+    return oneflow._oneflow_internal.sbp.split(axis)
diff --git a/python/oneflow/compatible/single_client/framework/distribute_context.py b/python/oneflow/compatible/single_client/framework/distribute_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..316df9778b2fac11d5b479c56b6f5d27d2f85fc2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/distribute_context.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+
+
+class DistributeStrategy(object):
+    def __init__(self, is_mirrored):
+        self.is_mirrored_ = is_mirrored
+        self.scope_context_ = None
+        sess = session_ctx.GetDefaultSession()
+        if sess.is_running and (
+            not sess.has_empty_is_mirrored_strategy_enabled_stack()
+        ):
+
+            def BuildScope(old_scope, builder):
+                return builder.BuildScopeWithNewIsMirrored(old_scope, is_mirrored)
+
+            self.scope_context_ = scope_util.ScopeContext(
+                scope_util.MakeScope(BuildScope)
+            )
+
+    def __enter__(self, *argc, **kwarg):
+        PushMirroredStrategyEnabled(self.is_mirrored_)
+        if self.scope_context_ is not None:
+            self.scope_context_.__enter__(*argc, **kwarg)
+
+    def __exit__(self, *argc, **kwarg):
+        PopMirroredStrategyEnabled()
+        if self.scope_context_ is not None:
+            self.scope_context_.__exit__(*argc, **kwarg)
+
+
+def PushMirroredStrategyEnabled(val):
+    session_ctx.GetDefaultSession().push_mirrored_strategy_enabled(val)
+
+
+def IsMirroredStrategyEnabled():
+    return session_ctx.GetDefaultSession().is_mirrored_strategy_enabled()
+
+
+def IsConsistentStrategyEnabled():
+    return session_ctx.GetDefaultSession().is_consistent_strategy_enabled()
+
+
+def PopMirroredStrategyEnabled():
+    session_ctx.GetDefaultSession().pop_mirrored_strategy_enabled()
diff --git a/python/oneflow/compatible/single_client/framework/dtype.py b/python/oneflow/compatible/single_client/framework/dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..96005a349ff219fa30ae59efaf83760060decaca
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/dtype.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.core.common import data_type_pb2 as data_type_pb2
+
+_dtypes = [
+    flow.char,
+    flow.float,
+    flow.float32,
+    flow.double,
+    flow.float64,
+    flow.float16,
+    flow.int8,
+    flow.int32,
+    flow.int64,
+    flow.uint8,
+    flow.record,
+    flow.tensor_buffer,
+]
+
+
+def dtypes():
+    return _dtypes
+
+
+def convert_proto_dtype_to_oneflow_dtype(proto_dtype):
+    return oneflow._oneflow_internal.deprecated.GetDTypeByDataType(proto_dtype)
+
+
+_ONEFLOW_DTYPE_TO_NUMPY_DTYPE = {
+    flow.char: np.byte,
+    flow.float: np.float32,
+    flow.float16: np.float16,
+    flow.float32: np.float32,
+    flow.float64: np.double,
+    flow.double: np.double,
+    flow.int8: np.int8,
+    flow.int32: np.int32,
+    flow.int64: np.int64,
+    flow.uint8: np.uint8,
+}
+
+
+def convert_oneflow_dtype_to_numpy_dtype(oneflow_dtype: flow.dtype):
+    if oneflow_dtype not in _ONEFLOW_DTYPE_TO_NUMPY_DTYPE:
+        raise NotImplementedError
+    return _ONEFLOW_DTYPE_TO_NUMPY_DTYPE[oneflow_dtype]
+
+
+def convert_numpy_dtype_to_oneflow_dtype(numpy_dtype: np.dtype):
+    for (k, v) in _ONEFLOW_DTYPE_TO_NUMPY_DTYPE.items():
+        if v == numpy_dtype:
+            return k
+    raise NotImplementedError
+
+
+del data_type_pb2
+del np
diff --git a/python/oneflow/compatible/single_client/framework/env_util.py b/python/oneflow/compatible/single_client/framework/env_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7e4f2aae6d027d7a9fae8eb63ec72304f0baf56
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/env_util.py
@@ -0,0 +1,403 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import socket
+import traceback
+from contextlib import closing
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_ctx,
+)
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.control import ctrl_bootstrap_pb2 as ctrl_bootstrap_pb
+from oneflow.core.job import env_pb2 as env_pb
+from oneflow.core.job import resource_pb2 as resource_util
+
+
+def api_all_device_placement(device_type: str) -> None:
+    """Return a placement containing all devices of all machines under env.
+
+    Args:
+        device_type (str): cuda or cpu
+    """
+    return oneflow._oneflow_internal.AllDevicePlacement(device_type)
+
+
+def api_enable_eager_execution(val: bool = True) -> None:
+    """If True, job will execute in eager mode, else use lazy mode(static graph).
+
+    Args:
+        val (bool, optional): Whether  eager execution or not.  Defaults to True.
+    """
+    return enable_if.unique([enable_eager_environment])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.any_global_function_defined)
+def enable_eager_environment(val=True):
+    return oneflow._oneflow_internal.EnableEagerEnvironment(val)
+
+
+def api_env_init() -> bool:
+    """Init environment for job
+
+    Returns:
+        bool: [description]
+    """
+    return enable_if.unique([_env_init_single_client, do_nothing])()
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def _env_init_single_client():
+    return env_init(False)
+
+
+def env_init(is_multi_client):
+    global default_env_proto
+    assert len(default_env_proto.machine) > 0
+    CompleteEnvProto(default_env_proto, is_multi_client)
+    c_api_util.InitEnv(default_env_proto, is_multi_client)
+    if not is_multi_client:
+        if oneflow._oneflow_internal.CurrentMachineId() == 0:
+            scope_util.InitScopeStack()
+        else:
+            exit(0)
+    return True
+
+
+def api_get_current_resource() -> resource_util.Resource:
+    """Get current resources, such as:machine nums, cpu/gpu device nums,
+            epoch network threed num, rdma params...
+
+    Returns:
+        resource_util.Resource: [description]
+    """
+    return enable_if.unique([get_current_resource])()
+
+
+@enable_if.condition(hob.in_normal_mode & hob.env_initialized)
+def get_current_resource():
+    return c_api_util.CurrentResource()
+
+
+def api_get_current_machine_id():
+    """Get machine id of current machine/node
+
+    Returns:
+        [type]: [description]
+    """
+    return enable_if.unique([get_current_machine_id])()
+
+
+@enable_if.condition(hob.in_normal_mode & hob.env_initialized)
+def get_current_machine_id() -> int:
+    return oneflow._oneflow_internal.CurrentMachineId()
+
+
+def api_machine(*val: list) -> None:
+    """Set machines' hostnames.
+
+    For instance:
+
+        oneflow.compatible.single_client.env.machine([{"addr": "192.168.1.1"}, {"addr": "192.168.1.2"}])
+
+    Args:
+        val:  `list`, `tuple` or multiple arguments of `dict`. First in the list is the master machine.
+    """
+    return enable_if.unique([machine, do_nothing])(*val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def machine(*val):
+    del default_env_proto.machine[:]
+    if len(val) == 1 and isinstance(val[0], (list, tuple)):
+        val = val[0]
+    default_env_proto.ClearField("machine")
+    default_env_proto.machine.extend(_MakeMachine(val))
+
+
+def api_ctrl_port(val: int) -> None:
+    """Set port number used to control the execution across multiple machines. Same on every machine.
+
+    Args:
+        val: a port number accessible to peer machines
+    """
+    return enable_if.unique([ctrl_port, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def ctrl_port(val):
+    assert type(val) is int
+    default_env_proto.ctrl_port = val
+
+
+def api_data_port(val: int) -> None:
+    """Set port number used to data transfer among multiple machines. Same on every machine.
+
+    Args:
+        val: a port number accessible to peer machines
+    """
+    return enable_if.unique([data_port, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def data_port(val):
+    assert type(val) is int
+    default_env_proto.data_port = val
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def api_grpc_use_no_signal(val: bool = True) -> None:
+    """Set rpc use signal or not (deprecate)
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.env.grpc_use_no_signal is deprecated, users no longer need to set rpc use signal or not. \n",
+        traceback.format_stack()[-2],
+    )
+    return None
+
+
+def api_log_dir(val: str) -> None:
+    """Specify a dir to store OneFlow's logging files. If not specified, it is `./log` by default.
+
+    Args:
+        val (str): string , log file path
+    """
+    return enable_if.unique([log_dir, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def log_dir(val):
+    assert type(val) is str
+    default_env_proto.cpp_logging_conf.log_dir = val
+
+
+def api_logtostderr(val: int) -> None:
+    """Set whether log messages go to stderr instead of logfiles
+
+    Args:
+        val (int): [description]
+    """
+    return enable_if.unique([logtostderr, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def logtostderr(val):
+    assert type(val) is int
+    default_env_proto.cpp_logging_conf.logtostderr = val
+
+
+def api_logbuflevel(val: int) -> None:
+    """Log messages at a level <= this flag are buffered.
+            Log messages at a higher level are flushed immediately.
+
+    Args:
+        val (int): int, number of level
+    """
+    return enable_if.unique([logbuflevel, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def logbuflevel(val):
+    assert type(val) is int
+    default_env_proto.cpp_logging_conf.logbuflevel = val
+
+
+@enable_if.condition(hob.in_normal_mode & hob.env_initialized)
+def do_nothing(*args, **kwargs):
+    print("Nothing happened because environment has been initialized")
+    return False
+
+
+def CompleteEnvProto(env_proto, is_multi_client):
+    if is_multi_client:
+        _UpdateDefaultEnvProtoByMultiClientEnvVars(env_proto)
+    if env_proto.HasField("ctrl_port") == False:
+        if len(env_proto.machine) == 1:
+            env_proto.ctrl_port = _FindFreePort()
+        else:
+            raise ValueError(
+                "a ctrl_port is required if running multi-node, set it with 'oneflow.compatible.single_client.env.ctrl_port([YOUR PORT])'"
+            )
+
+
+def _MakeMachine(machines):
+    if isinstance(machines, str):
+        machines = [machines]
+    rp_machine = env_pb.EnvProto().machine
+    for m_data in machines:
+        m = rp_machine.add()
+        if isinstance(m_data, str):
+            m.addr = m_data
+        elif isinstance(m_data, dict):
+            if "addr" in m_data:
+                m.addr = m_data["addr"]
+            if "ctrl_port_agent" in m_data:
+                m.ctrl_port_agent = m_data["ctrl_port_agent"]
+            if "data_port_agent" in m_data:
+                m.data_port_agent = m_data["data_port_agent"]
+        else:
+            raise NotImplementedError
+    id = 0
+    addrs_for_check = set()
+    for m in rp_machine:
+        m.id = id
+        id += 1
+        assert m.addr not in addrs_for_check
+        addrs_for_check.add(m.addr)
+    return rp_machine
+
+
+def api_init_bootstrap_confs(*val: list, **kargs) -> None:
+    return enable_if.unique([MakeBootstrapConfs, do_nothing])(*val, **kargs)
+
+
+def _MakeBootstrapConf(bootstrap_info: dict):
+    global config_master_addr
+    assert config_master_addr.HasField("host"), "must config master host first"
+    assert config_master_addr.HasField("port"), "must config master port first"
+    assert config_world_size != 0, "must config world size first"
+    bootstrap_conf = ctrl_bootstrap_pb.BootstrapConf()
+    bootstrap_conf.master_addr.CopyFrom(config_master_addr)
+    bootstrap_conf.world_size = config_world_size
+    assert "rank" in bootstrap_info
+    bootstrap_conf.rank = bootstrap_info["rank"]
+    if "host" in bootstrap_info:
+        bootstrap_conf.host = bootstrap_info["host"]
+    global config_bootstrap_ctrl_port
+    if config_bootstrap_ctrl_port != 0:
+        bootstrap_conf.ctrl_port = config_bootstrap_ctrl_port
+    global config_node_size
+    if config_node_size != 0:
+        bootstrap_conf.node_size = config_node_size
+    return bootstrap_conf
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def MakeBootstrapConfs(
+    node_list, master_port, world_size=0, ctrl_port=-1, node_size=-1
+):
+    """Set ctrl_bootstrap_conf' info.
+
+    For instance:
+
+        ONEFLOW_TEST_NODE_LIST=192.168.1.16,192.168.1.15 ONEFLOW_TEST_MASTER_PORT=43256
+        ONEFLOW_TEST_WORLD_SIZE=2 ONEFLOW_TEST_RANK_CTRL_PORT=34527
+
+    Args:
+        val:  `list`, First in the list is the master machine.
+    """
+    if isinstance(node_list, str):
+        node_list = [node_list]
+    global global_ctrl_bootstrap_confs
+    assert len(global_ctrl_bootstrap_confs) == 0, "ctrl_bootstrap_conf has been inited"
+    global config_master_addr
+    config_master_addr.host = node_list[0]
+    config_master_addr.port = master_port
+    global config_world_size
+    if world_size == 0:
+        config_world_size = len(node_list)
+    else:
+        assert world_size % len(node_list) == 0
+        config_world_size = world_size
+    global config_bootstrap_ctrl_port
+    if ctrl_port != -1:
+        config_bootstrap_ctrl_port = ctrl_port
+    global config_node_size
+    if node_size != -1:
+        config_node_size = node_size
+    rank = 0
+    for rank_host in node_list:
+        assert isinstance(rank_host, str)
+        bootstrap_conf = _MakeBootstrapConf({"rank": rank, "host": rank_host})
+        if rank == 0:
+            global default_env_proto
+            default_env_proto.ctrl_bootstrap_conf.CopyFrom(bootstrap_conf)
+        global_ctrl_bootstrap_confs.append(bootstrap_conf)
+        rank += 1
+    return global_ctrl_bootstrap_confs
+
+
+def _DefaultEnvProto():
+    env_proto = env_pb.EnvProto()
+    machine = env_proto.machine.add()
+    machine.id = 0
+    machine.addr = "127.0.0.1"
+    return env_proto
+
+
+def _FindFreePort():
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(("localhost", 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
+
+
+def GetEnvDefaultParallelConf(device_tag):
+    if device_tag not in device_tag2default_parallel_conf:
+        parallel_conf = placement_ctx.MakeParallelConf4Resource(
+            device_tag, c_api_util.EnvResource()
+        )
+        device_tag2default_parallel_conf[device_tag] = parallel_conf
+    return device_tag2default_parallel_conf[device_tag]
+
+
+def HasAllMultiClientEnvVars():
+    return (
+        os.getenv("MASTER_ADDR")
+        and os.getenv("MASTER_PORT")
+        and os.getenv("WORLD_SIZE")
+        and os.getenv("RANK")
+        and os.getenv("LOCAL_RANK")
+    )
+
+
+def _UpdateDefaultEnvProtoByMultiClientEnvVars(env_proto):
+    assert HasAllMultiClientEnvVars()
+
+    def str2int(env_config):
+        assert env_config.isdigit()
+        return int(env_config)
+
+    bootstrap_conf = ctrl_bootstrap_pb.BootstrapConf()
+    master_addr = ctrl_bootstrap_pb.Address()
+    master_addr.host = os.getenv("MASTER_ADDR")
+    master_addr.port = str2int(os.getenv("MASTER_PORT"))
+    bootstrap_conf.master_addr.CopyFrom(master_addr)
+    bootstrap_conf.world_size = str2int(os.getenv("WORLD_SIZE"))
+    bootstrap_conf.rank = str2int(os.getenv("RANK"))
+    env_proto.ctrl_bootstrap_conf.CopyFrom(bootstrap_conf)
+
+
+device_tag2default_parallel_conf = {}
+default_env_proto = _DefaultEnvProto()
+config_master_addr = ctrl_bootstrap_pb.Address()
+config_world_size = 0
+config_bootstrap_ctrl_port = 0
+config_node_size = 0
+global_ctrl_bootstrap_confs = []
diff --git a/python/oneflow/compatible/single_client/framework/function_desc.py b/python/oneflow/compatible/single_client/framework/function_desc.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5961eab07d57885a151d58ad6079084c40ace3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/function_desc.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import job_conf as job_conf_cfg
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+class FunctionAttribute(object):
+    def __init__(self):
+        self.default_placement_scope = None
+        self.default_distribute_strategy = None
+        self.allow_cpu_return_op = True
+
+
+class FunctionDesc(object):
+    def __init__(self, job_func=None, job_config_proto=None, function_attribute=None):
+        if job_config_proto is None:
+            job_config_proto = job_conf_cfg.JobConfigProto()
+        if function_attribute is None:
+            function_attribute = FunctionAttribute()
+        self.job_func = job_func
+        self.job_config_proto = job_config_proto
+        self.job_config_proto.mutable_predict_conf()
+        self.function_attribute = function_attribute
+
+    def IsTrainable(self):
+        if self.job_config_proto.has_train_conf():
+            return True
+        if self.job_config_proto.has_predict_conf():
+            return False
+        raise NotImplementedError
+
+    def HasAttr(self, attr_name):
+        if attr_name == "flag_name2flag_value":
+            return False
+        name2default = session_ctx.GetDefaultSession().function_flag_name2default_val
+        if attr_name in self.job_config_proto.flag_name2flag_value():
+            return True
+        return getattr(self.job_config_proto, "has_" + attr_name)()
+
+    def __getattr__(self, attr_name):
+        assert attr_name != "flag_name2flag_value"
+        flag_name2flag_value = self.job_config_proto.flag_name2flag_value()
+        name2default = session_ctx.GetDefaultSession().function_flag_name2default_val
+        if attr_name not in name2default:
+            assert getattr(self.job_config_proto, "has_" + attr_name)()
+            return getattr(self.job_config_proto, attr_name)()
+        attr_value = name2default[attr_name]
+        if attr_name in flag_name2flag_value:
+            attr_value = flag_name2flag_value[attr_name]
+        if attr_value.HasField("at_bool"):
+            return attr_value.at_bool
+        elif attr_value.HasField("at_int64"):
+            return attr_value.at_int64
+        elif attr_value.HasField("at_double"):
+            return attr_value.at_double
+        elif attr_value.HasField("at_string"):
+            return attr_value.at_string
+        else:
+            raise NotImplementedError()
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def GetCurrentEagerGlobalFunctionDesc():
+    sess = session_ctx.GetDefaultSession()
+    ret = sess.CurrentEagerGlobalFunctionDesc()
+    assert ret is not None
+    return ret
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def GetCurrentLazyGlobalFunctionDesc():
+    sess = session_ctx.GetDefaultSession()
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    ret = sess.GetLazyFunctionDesc(job_name)
+    assert ret is not None
+    return ret
+
+
+def api_current_global_function_desc() -> FunctionDesc:
+    api_func = enable_if.unique(
+        [GetCurrentLazyGlobalFunctionDesc, GetCurrentEagerGlobalFunctionDesc]
+    )
+    return api_func()
diff --git a/python/oneflow/compatible/single_client/framework/function_util.py b/python/oneflow/compatible/single_client/framework/function_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..f33e1658508f396164f0310ee9f6260f77f40459
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/function_util.py
@@ -0,0 +1,992 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import copy
+import functools
+import inspect
+import re
+import sys
+import traceback
+from typing import Any, Callable, Optional, Union
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow._oneflow_internal.oneflow.core.common import data_type as data_type_cfg
+from oneflow.compatible.single_client.framework import (
+    distribute_context as distribute_ctx,
+)
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_ctx,
+)
+from oneflow.compatible.single_client.framework import runtime_mode as rt_mode
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.framework import typing_util as oft_util
+from oneflow.compatible.single_client.framework.function_desc import FunctionDesc
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.compatible.single_client.support import pb_util as pb_util
+
+
+class FunctionConfig(object):
+    """OneFlow function's configurations.
+    """
+
+    def __init__(self) -> None:
+        self.function_desc = FunctionDesc()
+
+    def __getattr__(
+        self, attr_name: str
+    ) -> Callable[[Optional[Union[bool, int, float, str]]], None]:
+        name2default = session_ctx.GetDefaultSession().function_flag_name2default_val
+        assert attr_name in name2default
+        flag_name2flag_value = (
+            self.function_desc.job_config_proto.mutable_flag_name2flag_value()
+        )
+        default_val = name2default[attr_name]
+
+        def FunctionConfigSetter(
+            attr_value: Optional[Union[bool, int, float, str]] = None
+        ) -> None:
+            if default_val.HasField("at_bool"):
+                if attr_value is None:
+                    attr_value = True
+                assert type(attr_value) is bool
+                flag_name2flag_value[attr_name].set_at_bool(attr_value)
+            elif default_val.HasField("at_int64"):
+                assert type(attr_value) is int
+                flag_name2flag_value[attr_name].set_at_int64(attr_value)
+            elif default_val.HasField("at_double"):
+                assert type(attr_value) is float
+                flag_name2flag_value[attr_name].set_at_double(attr_value)
+            elif default_val.HasField("at_string"):
+                assert type(attr_value) is str
+                flag_name2flag_value[attr_name].set_at_string(attr_value)
+            else:
+                raise NotImplementedError(
+                    "config_flag `%s' with type %s is not supported"
+                    % (attr_name, type(attr_value))
+                )
+
+        return FunctionConfigSetter
+
+
+def api_oneflow_function(
+    type: str = "predict", function_config: FunctionConfig = None
+) -> Callable[[Callable], Callable]:
+    """Creates a callable OneFlow global function from a Python function.
+
+    For instance::
+
+        @oneflow.compatible.single_client.global_function(flow.FunctionConfig())
+        def train():
+            # your model
+
+    Args:
+        function_config (FunctionConfig, optional): a `FunctionConfig` object. Defaults to FunctionConfig().
+
+    Returns:
+        Callable[[Callable], Callable]: a callable which is called to execute the compiled function
+    """
+    if isinstance(type, FunctionConfig):
+        function_config = type
+        print(
+            "WARNING: flow.global_function(func_config) is deprecated. Please replace it with flow.global_function(type, func_config).\n            "
+        )
+        print(traceback.format_stack()[-2])
+    else:
+        assert type in ["train", "predict"]
+        if function_config is None:
+            function_config = FunctionConfig()
+        if type == "train":
+            function_config.function_desc.job_config_proto.mutable_train_conf()
+        else:
+            function_config.function_desc.job_config_proto.mutable_predict_conf()
+    api = enable_if.unique([eager_oneflow_function, lazy_oneflow_function])
+    return api(function_config)
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_oneflow_function(function_config=FunctionConfig()):
+    assert isinstance(function_config, FunctionConfig)
+
+    def Decorator(job_func):
+        if not hasattr(job_func, "__oneflow_function_signature__"):
+            job_func.__oneflow_function_signature__ = inspect.signature(job_func)
+        oft_util.CheckGlobalFunctionAnnotation(job_func.__oneflow_function_signature__)
+        sess = session_ctx.GetDefaultSession()
+        function_desc = _CloneFunctionDesc(function_config.function_desc, job_func)
+
+        @functools.wraps(job_func)
+        def Func(*args, **kwargs):
+            return _RunEagerJob(sess, function_desc, *args, **kwargs)
+
+        for x in dir(job_func):
+            if x.startswith("__oneflow_"):
+                setattr(Func, x, getattr(job_func, x))
+        return Func
+
+    return Decorator
+
+
+@enable_if.condition(
+    hob.in_normal_mode & ~hob.eager_execution_enabled & ~hob.session_initialized
+)
+def lazy_oneflow_function(function_config=FunctionConfig()):
+    assert isinstance(function_config, FunctionConfig)
+
+    def Decorator(job_func):
+        if not hasattr(job_func, "__oneflow_function_signature__"):
+            job_func.__oneflow_function_signature__ = inspect.signature(job_func)
+        oft_util.CheckGlobalFunctionAnnotation(job_func.__oneflow_function_signature__)
+        sess = session_ctx.GetDefaultSession()
+
+        @functools.wraps(job_func)
+        def Func(*args, **kwargs):
+            return _RunLazyJob(sess, job_func, *args, **kwargs)
+
+        sess.AddJob(_CloneFunctionDesc(function_config.function_desc, job_func))
+        for x in dir(job_func):
+            if x.startswith("__oneflow_"):
+                setattr(Func, x, getattr(job_func, x))
+        return Func
+
+    return Decorator
+
+
+def global_function_or_identity(*args, **kwargs):
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        return api_oneflow_function(*args, **kwargs)
+    else:
+        assert rt_mode.CurrentMode() == rt_mode.GLOBAL_MODE
+        identity_decorator = lambda func: func
+        return identity_decorator
+
+
+def _CloneFunctionDesc(func_desc, job_func):
+    new_func_desc = FunctionDesc(job_func=job_func)
+    new_func_desc.job_config_proto.CopyFrom(func_desc.job_config_proto)
+    new_func_desc.function_attribute = copy.deepcopy(func_desc.function_attribute)
+    return new_func_desc
+
+
+def oneflow_function_config(*field_paths):
+    def Decorator(func):
+        global _class_property2return_obj_class
+        for field_path in field_paths:
+            fields = field_path.split(".")
+            assert len(fields) > 0
+            cls = FunctionConfig
+            for (index, field) in enumerate(fields):
+                assert field != "function_desc"
+                assert re.match("^[_\\w]+[_\\w\\d]*$", field)
+                if (cls, field) not in _class_property2return_obj_class:
+                    class_name = ".".join(["function_config"] + fields[: index + 1])
+
+                    def Init(self, function_desc):
+                        self.function_desc = function_desc
+
+                    config_class = type(class_name, (object,), dict(__init__=Init))
+                    setattr(cls, field, _MakeInnerJobConfigClassProperty(config_class))
+                    _class_property2return_obj_class[cls, field] = config_class
+                cls = _class_property2return_obj_class[cls, field]
+            cls.__call__ = _MakeLeafJobConfigCall(func)
+        return func
+
+    return Decorator
+
+
+_class_property2return_obj_class = {}
+
+
+def _MakeInnerJobConfigClassProperty(return_obj_class):
+    return property(lambda self: return_obj_class(self.function_desc))
+
+
+def _MakeLeafJobConfigCall(method):
+    return lambda self, *argv, **kwarg: method(self.function_desc, *argv, **kwarg)
+
+
+def _RunEagerJob(session, function_desc, *args):
+    return session.TryInit().EagerRun(function_desc, *args)
+
+
+def _RunLazyJob(session, job_func, *args, **kwargs):
+    return session.TryInit().LazyRun(job_func, *args, **kwargs)
+
+
+@oneflow_function_config("default_data_type")
+def set_default_data_type(func_desc, value):
+    """Set default data type for job
+
+    Args:
+        func_desc ([type]): job function
+        value ([type]): data type. e.g. flow.float
+    """
+    func_desc.job_config_proto.set_default_data_type(
+        data_type_cfg.DataType(
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(value)
+        )
+    )
+
+
+@oneflow_function_config("default_initializer_conf")
+def set_default_initializer_conf(func_desc, value):
+    """Set default initial configuration for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert type(value) is dict
+    pb_util.PythonDict2CFG(
+        value, func_desc.job_config_proto.mutable_default_initializer_conf()
+    )
+
+
+@oneflow_function_config("exp_run_conf")
+def set_exp_run_conf(value):
+    """Set experimental configuration for job
+
+    Args:
+        value ([type]): [description]
+    """
+    assert type(func_desc, value) is dict
+    pb_util.PythonDict2CFG(value, func_desc.job_config_proto.mutable_exp_run_conf())
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.has")
+def static_mem_alloc_policy_white_list_has_policy(func_desc, policy):
+    """Get items from white list related to static memory allocation policy
+
+    Args:
+        func_desc ([type]): [description]
+        policy ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return getattr(
+        func_desc.job_config_proto.mutable_memory_allocation_algorithm_conf(), policy
+    )()
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.add")
+def static_mem_alloc_policy_white_list_add_policy(func_desc, policy):
+    """Add item to white list related to static memory allocation policy
+
+    Args:
+        func_desc ([type]): [description]
+        policy ([type]): [description]
+    """
+    getattr(
+        func_desc.job_config_proto.mutable_memory_allocation_algorithm_conf(),
+        "set_" + policy,
+    )(True)
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.remove")
+def static_mem_alloc_policy_white_list_remove_policy(func_desc, policy):
+    """Remove item of white list related to static memory allocation policy
+
+    Args:
+        func_desc ([type]): [description]
+        policy ([type]): [description]
+    """
+    getattr(
+        func_desc.job_config_proto.mutable_memory_allocation_algorithm_conf(),
+        "set_" + policy,
+    )(False)
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.policy_mem_size_first")
+def policy_mem_size_first(func_desc):
+    """A static memory allocation policy called: mem_size_first
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return "use_mem_size_first_algo"
+
+
+@oneflow_function_config(
+    "static_mem_alloc_policy_white_list.policy_mutual_exclusion_first"
+)
+def policy_mutual_exclusion_first(func_desc):
+    """A static memory allocation policy called: mutual_exclusion_first
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return "use_mutual_exclusion_first_algo"
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.policy_time_line")
+def policy_time_line(func_desc):
+    """A static memory allocation policy called: time_line
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return "use_time_line_algo"
+
+
+@oneflow_function_config("static_mem_alloc_algo_white_list.show")
+def show_static_mem_alloc_algo_white_list(func_desc):
+    """Show configuration of  static memory allocation policy,
+          including: "use_mem_size_first_algo", "use_mutual_exclusion_first_algo", "use_time_line_algo"
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return [
+        "use_mem_size_first_algo",
+        "use_mutual_exclusion_first_algo",
+        "use_time_line_algo",
+    ]
+
+
+@oneflow_function_config("enable_cudnn")
+def set_enable_cudnn(func_desc, value=True):
+    """Whether use cudnn to accelerate job or not.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_cudnn(value)
+
+
+@oneflow_function_config("cudnn_buf_limit_mbyte")
+def set_cudnn_buf_limit_mbyte(func_desc, value):
+    """Set cudnn buffer limit, e.g. 1024mb
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_buf_limit_mbyte(value)
+
+
+@oneflow_function_config("cudnn_conv_force_fwd_algo")
+def set_cudnn_conv_force_fwd_algo(func_desc, value):
+    """Set value to cudnn conv_force_forward algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_force_fwd_algo(value)
+
+
+@oneflow_function_config("cudnn_conv_force_bwd_data_algo")
+def set_cudnn_conv_force_bwd_data_algo(func_desc, value):
+    """Set value to cudnn conv_force_backward_data algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_force_bwd_data_algo(value)
+
+
+@oneflow_function_config("cudnn_conv_force_bwd_filter_algo")
+def set_cudnn_conv_force_bwd_filter_algo(func_desc, value):
+    """Set value to cudnn conv_force_backward_filter algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_force_bwd_filter_algo(value)
+
+
+@oneflow_function_config("cudnn_conv_heuristic_search_algo")
+def set_cudnn_conv_heuristic_search_algo(func_desc, value):
+    """Set value to cudnn conv_heuristic_search algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_heuristic_search_algo(value)
+
+
+@oneflow_function_config("enable_cudnn_fused_normalization_add_relu")
+def set_enable_cudnn_fused_normalization_add_relu(func_desc, value):
+    """Whether enable cudnn_fused_normalization_add_relu.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_cudnn_fused_normalization_add_relu(value)
+
+
+@oneflow_function_config("enable_fuse_add_to_output")
+def set_enable_fuse_add_to_output(func_desc, value):
+    """Whether enable fuse_add_to_output.
+            If enabled, try to fuse a binary element-wise add to one of the predecessors to improve performance.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_fuse_add_to_output(value)
+
+
+@oneflow_function_config("enable_fuse_cast_scale")
+def set_enable_fuse_cast_scale(func_desc, value=True):
+    """Whether enable fuse_cast_scale.
+            If enabled, try to fuse cast and scalar_mul_by_tensor to improve performance.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_fuse_cast_scale(value)
+
+
+@oneflow_function_config("cudnn_conv_use_deterministic_algo_only")
+def set_cudnn_conv_use_deterministic_algo_only(func_desc, value):
+    """Set value to cudnn conv_use_deterministic_only algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_use_deterministic_algo_only(value)
+
+
+@oneflow_function_config("enable_reused_mem")
+def set_enable_reused_mem(func_desc, value=True):
+    """Whether enable reuse memory or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_reused_mem(value)
+
+
+@oneflow_function_config("enable_inplace")
+def set_enable_inplace(func_desc, value=True):
+    """Whether enable inplace  or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_inplace(value)
+
+
+@oneflow_function_config("enable_inplace_in_reduce_struct")
+def set_enable_inplace_in_reduce_struct(func_desc, value=True):
+    print(
+        "'enable_inplace_in_reduce_struct' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("enable_nccl")
+def set_enable_nccl(func_desc, value=True):
+    print(
+        "'enable_nccl' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("use_nccl_inter_node_communication")
+def set_use_nccl_inter_node_communication(func_desc, value=True):
+    print(
+        "'use_nccl_inter_node_communication' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("use_boxing_v2")
+def set_use_boxing_v2(func_desc, value=True):
+    print(
+        "'use_boxing_v2' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("do_parallel_cast_before_widening_type_cast")
+def set_do_parallel_cast_before_widening_type_cast(func_desc, value=True):
+    func_desc.job_config_proto.set_do_parallel_cast_before_widening_type_cast(value)
+
+
+@oneflow_function_config("enable_all_reduce_group")
+def set_enable_all_reduce_group(func_desc, value=True):
+    print(
+        "'enable_all_reduce_group' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_group_num")
+def set_all_reduce_group_num(func_desc, value):
+    print(
+        "'all_reduce_group_num' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_lazy_ratio")
+def set_all_reduce_lazy_ratio(func_desc, value):
+    print(
+        "'all_reduce_lazy_ratio' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_group_min_mbyte")
+def set_all_reduce_group_min_mbyte(func_desc, value):
+    print(
+        "'all_reduce_group_min_mbyte' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_group_size_warmup")
+def set_all_reduce_group_size_warmup(func_desc, value):
+    print(
+        "'all_reduce_group_size_warmup' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_fp16")
+def set_all_reduce_fp16(func_desc, value=True):
+    print(
+        "'all_reduce_fp16' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config(
+    "optimizer_placement_optimization_mode",
+    "train.optimizer_placement_optimization_mode",
+)
+def set_optimizer_placement_optimization_mode(func_desc, mode):
+    """Enable optimizer_placement_optimization with mode 'mode'
+
+    Args:
+        func_desc ([type]): [description]
+        mode (str): [description].
+    """
+    assert mode in ["non_distributed", "distributed_split"]
+    func_desc.job_config_proto.set_optimizer_placement_optimization_mode(mode)
+
+
+@oneflow_function_config(
+    "optimizer_placement_optimization_threshold",
+    "train.optimizer_placement_optimization_threshold",
+)
+def set_optimizer_placement_optimization_threshold(func_desc, value):
+    func_desc.job_config_proto.set_optimizer_placement_optimization_threshold(value)
+
+
+@oneflow_function_config("enable_non_distributed_optimizer")
+def set_enable_non_distributed_optimizer(func_desc, value=True):
+    """Whether enable non_distributed optimizer or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    if value:
+        set_optimizer_placement_optimization_mode(func_desc, "non_distributed")
+
+
+@oneflow_function_config("disable_all_reduce_sequence")
+def set_disable_all_reduce_sequence(func_desc, value=True):
+    print(
+        "'disable_all_reduce_sequence' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("prune_parallel_cast_ops")
+def set_prune_parallel_cast_ops(func_desc, value=True):
+    """Whether prune parallel cast  operations or not.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_prune_parallel_cast_ops(value)
+
+
+@oneflow_function_config("prune_cast_to_static_shape_ops")
+def set_prune_cast_to_static_shape_ops(func_desc, value=True):
+    """Whether or not set prune_cast to static shape opretions
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_prune_cast_to_static_shape_ops(value)
+
+
+@oneflow_function_config("prune_amp_white_identity_ops")
+def set_prune_amp_white_identity_ops(func_desc, value=True):
+    """Whether prune amp_white_identity operations or not.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_prune_amp_white_identity_ops(value)
+
+
+@oneflow_function_config("non_distributed_optimizer_group_size_mbyte")
+def set_non_distributed_optimizer_group_size_mbyte(func_desc, value):
+    print(
+        "'non_distributed_optimizer_group_size_mbyte' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config(
+    "enable_true_half_config_when_conv", "cudnn_conv_enable_true_half"
+)
+def set_cudnn_conv_enable_true_half(func_desc, value=True):
+    """Whether  use true_half mode or not during  convolution calculation process while using cudnn.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_cudnn_conv_enable_pseudo_half(not value)
+
+
+@oneflow_function_config(
+    "cudnn_conv_enable_pseudo_half", "enable_cudnn_conv_pseudo_half"
+)
+def set_cudnn_conv_enable_pseudo_half(func_desc, value):
+    """Whether  enable pseudo_half mode or not during  convolution calculation process while using cudnn
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_enable_pseudo_half(value)
+
+
+@oneflow_function_config("enable_float_compute_for_half_gemm")
+def set_enable_float_compute_for_half_gemm(func_desc, value=True):
+    """Whether  enable float_compute or not ,
+          if True, means that the type of intermedia value is float when compute half gemm.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    print(
+        "WARNING: enable_float_compute_for_half_gemm has been deprecated, because we always use float compute for half gemm. Please remove it.\n        "
+    )
+    print(traceback.format_stack()[-3])
+
+
+@oneflow_function_config("enable_quantization_aware_training")
+@oneflow_function_config("enable_qat")
+def set_enable_quantization_aware_training(func_desc, value=True):
+    """If true, then job will use quantization aware training
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_quantization_aware_training(value)
+
+
+@oneflow_function_config("qat.per_channel_weight_quantization")
+def set_qat_per_channel(func_desc, value=True):
+    func_desc.job_config_proto.mutable_qat_config().set_per_channel_weight_quantization(
+        value
+    )
+
+
+@oneflow_function_config("qat.symmetric")
+def set_qat_symmetric(func_desc, value=True):
+    func_desc.job_config_proto.mutable_qat_config().set_symmetric(value)
+
+
+@oneflow_function_config("qat.moving_min_max_momentum")
+def set_qat_moving_min_max_momentum(func_desc, value: float):
+    func_desc.job_config_proto.mutable_qat_config().set_moving_min_max_momentum(value)
+
+
+@oneflow_function_config("qat.moving_min_max_stop_update_after_iters")
+def set_qat_moving_min_max_momentum(func_desc, value: float):
+    func_desc.job_config_proto.mutable_qat_config().set_moving_min_max_stop_update_after_iters(
+        value
+    )
+
+
+@oneflow_function_config("qat.target_backend")
+def set_qat_symmetric(func_desc, value: str):
+    func_desc.job_config_proto.mutable_qat_config().set_target_backend(value)
+
+
+@oneflow_function_config("enable_auto_mixed_precision")
+def set_enable_auto_mixed_precision(func_desc, value=True):
+    """If true, then job will use mixed precision mode, it means use both float16 and float32 during model training.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_auto_mixed_precision(value)
+
+
+@oneflow_function_config("enable_keep_header_only")
+def set_enable_keep_header_only(func_desc, value=True):
+    """deprecated api.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    print("Sorry! enable_keep_header_only is deprecated and it doesn't work.\n")
+
+
+@oneflow_function_config("concurrency_width")
+def set_concurrency_width(func_desc, value):
+    """Set up concurrency width
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_concurrency_width(value)
+
+
+@oneflow_function_config("train.model_update_conf")
+def set_model_update_conf(func_desc, value):
+    """Set up optimizer and update method of learning rate  for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    assert type(value) is dict
+    pb_msg = func_desc.job_config_proto.mutable_train_conf().mutable_model_update_conf()
+    pb_util.PythonDict2CFG(value, pb_msg)
+
+
+@oneflow_function_config("indexed_slices_optimizer_conf")
+def set_indexed_slices_optimizer_conf(func_desc, value):
+    """Set indexed slices configuration of optimizer
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert type(value) is dict
+    pb_msg = func_desc.job_config_proto.mutable_indexed_slices_optimizer_conf()
+    pb_util.PythonDict2CFG(value, pb_msg)
+
+
+@oneflow_function_config("enable_fuse_model_update_ops")
+def set_enable_fuse_model_update_ops(func_desc, value=True):
+    """Whether enable fuse_model_update_ops.
+            If enabled, try to fuse cast + scale + l1_l2_regularize_gradient + model_update to one op to improve performance.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_fuse_model_update_ops(value)
+
+
+@oneflow_function_config("enable_gradients_stats_aggregation")
+def set_enable_gradients_stats_aggregation(func_desc, value=True):
+    """Whether enable gradients_stats_aggregation.
+            If enabled, gradients stats ops (norm, finite, ...) will be aggregated.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_gradients_stats_aggregation(value)
+
+
+@oneflow_function_config("train.loss_scale_factor")
+def set_loss_scale_factor(func_desc, value):
+    """Set scale factor for loss
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    func_desc.job_config_proto.mutable_train_conf().set_loss_scale_factor(value)
+
+
+@oneflow_function_config("train.primary_lr")
+def set_primary_lr(func_desc, value):
+    """Set the primary leaning rate for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    func_desc.job_config_proto.mutable_train_conf().set_primary_lr(value)
+
+
+@oneflow_function_config("train.secondary_lr")
+def set_secondary_lr(func_desc, value):
+    """Set the secondary leaning rate for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    func_desc.job_config_proto.mutable_train_conf().set_secondary_lr(value)
+
+
+@oneflow_function_config("train.num_gradient_accumulation_steps")
+def set_num_gradient_accumulation_steps(func_desc, value):
+    func_desc.job_config_proto.set_num_gradient_accumulation_steps(value)
+
+
+@oneflow_function_config("default_placement_scope")
+def set_default_placement(func_desc, value):
+    """Set the default placement for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert isinstance(value, placement_ctx.EmptyPlacementScope)
+    func_desc.function_attribute.default_placement_scope = value
+
+
+@oneflow_function_config("use_xla_jit")
+def set_use_xla_jit(func_desc, value=True):
+    """Whether use xla  or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.mutable_xrt_config().set_use_xla_jit(value)
+
+
+@oneflow_function_config("use_tensorrt")
+def set_use_tensorrt(func_desc, value=True):
+    """Whether use tensorrt or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.mutable_xrt_config().set_use_tensorrt(value)
+
+
+@oneflow_function_config("tensorrt.use_fp16")
+def set_tensorrt_use_fp16(func_desc, value=True):
+    """Whether use tensorrt fp16  or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    set_use_tensorrt(func_desc, True)
+    func_desc.job_config_proto.mutable_xrt_config().mutable_tensorrt_config().set_use_fp16(
+        value
+    )
+
+
+@oneflow_function_config("tensorrt.use_int8")
+def set_tensorrt_use_int8(func_desc, value=True):
+    """Whether use tensorrt int8 mode or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    set_use_tensorrt(func_desc, True)
+    func_desc.job_config_proto.mutable_xrt_config().mutable_tensorrt_config().set_use_int8(
+        value
+    )
+
+
+@oneflow_function_config("tensorrt.int8_calibration")
+def set_tensorrt_int8_calibration(func_desc, value):
+    """Set up calibration of tensorrt int8
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert func_desc.job_config_proto.xrt_config().tensorrt_config().use_int8()
+    func_desc.job_config_proto.mutable_xrt_config().mutable_tensorrt_config().set_int8_calibration(
+        value
+    )
+
+
+@oneflow_function_config("default_logical_view")
+def set_default_distribute_strategy(func_desc, value):
+    """Set up default distribute strategy for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert isinstance(value, distribute_ctx.DistributeStrategy)
+    func_desc.function_attribute.default_distribute_strategy = value
+
+
+@oneflow_function_config("allow_cpu_return_op")
+def allow_cpu_return_op(func_desc, value):
+    """Whether allow operaions returned from cpu or  not
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.function_attribute.allow_cpu_return_op = value
+
+
+@oneflow_function_config("default_distribute_strategy")
+@oneflow_deprecate()
+def deprecated_set_default_distribute_strategy(*args, **kwargs):
+    print(
+        "WARNING:",
+        "function_config.default_distribute_strategy",
+        "has been deprecated. Please use {} instead.".format(
+            "function_config.default_logical_view"
+        ),
+    )
+    print(traceback.format_stack()[-3], file=sys.stderr)
+    set_default_distribute_strategy(*args, **kwargs)
diff --git a/python/oneflow/compatible/single_client/framework/functional.py b/python/oneflow/compatible/single_client/framework/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f58b02fbbd22e6187c99411354e5d4b56b554e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/functional.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+
+
+def RecursveDetermine(arg):
+    if isinstance(arg, flow.Tensor):
+        if not arg.is_determined:
+            arg.determine()
+        return arg._local_or_consistent_tensor
+    elif isinstance(arg, list) or isinstance(arg, tuple):
+        arg = list(arg)
+        for i in range(len(arg)):
+            arg[i] = RecursveDetermine(arg[i])
+        return arg
+    elif isinstance(arg, dict):
+        for (k, v) in arg.items():
+            arg[k] = RecursveDetermine(v)
+    else:
+        return arg
+
+
+class Function:
+    def __init__(self, func_name, handle):
+        self.func_name = func_name
+        self.handle = handle
+
+    def __call__(self, *args, **kwargs):
+        args = list(args)
+        for i in range(len(args)):
+            args[i] = RecursveDetermine(args[i])
+        for (k, v) in kwargs.items():
+            kwargs[k] = RecursveDetermine(v)
+        return self.handle(*args, **kwargs)
+
+
+def RegisterFunctionalApis():
+    import inspect
+
+    from oneflow.compatible.single_client import F
+    from oneflow.compatible.single_client.experimental import F as expr_F
+
+    for s in dir(oneflow._oneflow_internal.F):
+        f = getattr(oneflow._oneflow_internal.F, s)
+        if inspect.isbuiltin(f):
+            func_name = s
+            if s in _function_name_aliases:
+                func_name = _function_name_aliases[s]
+                setattr(F, func_name, Function(func_name, f))
+                setattr(expr_F, func_name, Function(func_name, f))
+            setattr(F, s, Function(func_name, f))
+            setattr(expr_F, s, Function(func_name, f))
+    del inspect
+
+
+_function_name_aliases = {"add_scalar": "scalar_add"}
diff --git a/python/oneflow/compatible/single_client/framework/generator.py b/python/oneflow/compatible/single_client/framework/generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5de10191c1be257f925156841e114de1939d6c3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/generator.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+
+
+def create_generator(device=None):
+    if device is None:
+        device = "auto"
+    return oneflow._oneflow_internal.create_generator(device)
+
+
+def default_generator(device=None):
+    if device is None:
+        device = "auto"
+    return oneflow._oneflow_internal.default_generator(device)
+
+
+def manual_seed(seed):
+    oneflow._oneflow_internal.manual_seed(seed)
diff --git a/python/oneflow/compatible/single_client/framework/hob.py b/python/oneflow/compatible/single_client/framework/hob.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a02d4301e2e3c4ece378f1f3ddd3a73c06bc2cf
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/hob.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import runtime_mode as rt_mode
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support.high_order_bool import bool_functor
+
+
+@bool_functor("Current mode is %s" % rt_mode.NORMAL_MODE)
+def in_normal_mode(ctx):
+    return rt_mode.CurrentMode() == rt_mode.NORMAL_MODE
+
+
+@bool_functor("Current mode is %s" % rt_mode.GLOBAL_MODE)
+def in_global_mode(ctx):
+    return rt_mode.CurrentMode() == rt_mode.GLOBAL_MODE
+
+
+@bool_functor("Current mode is %s" % rt_mode.DEVICE_MODE)
+def in_device_mode(ctx):
+    return rt_mode.CurrentMode() == rt_mode.DEVICE_MODE
+
+
+@bool_functor("Environment initialized")
+def env_initialized(ctx):
+    assert in_normal_mode(ctx)
+    return oneflow._oneflow_internal.IsEnvInited()
+
+
+@bool_functor("Any global function defined")
+def any_global_function_defined(ctx):
+    assert in_normal_mode(ctx)
+    return session_ctx.GetDefaultSession().AnyGlobalFunctionDefined()
+
+
+@bool_functor("Eager execution enabled")
+def eager_execution_enabled(ctx):
+    return oneflow._oneflow_internal.EagerExecutionEnabled()
+
+
+@bool_functor("Session initialized")
+def session_initialized(ctx):
+    assert in_normal_mode(ctx)
+    return session_ctx.GetDefaultSession().is_running
+
+
+@bool_functor("Current global function is trainable")
+def is_trainable(ctx):
+    assert in_global_mode(ctx)
+    if oneflow._oneflow_internal.EagerExecutionEnabled():
+        return session_ctx.GetDefaultSession().CurrentEagerGlobalFunctionDesc()
+    else:
+        job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+        return session_ctx.GetDefaultSession().GetFunctionDesc(job_name)
+
+
+@bool_functor("Current machine is master")
+def is_current_machine_master(ctx):
+    return oneflow._oneflow_internal.CurrentMachineId() == 0
+
+
+@bool_functor("Consistent view enabled")
+def consistent_view_enabled(ctx):
+    return flow.scope.consistent_view_enabled()
+
+
+@bool_functor("Mirrored view enabled")
+def mirrored_view_enabled(ctx):
+    return flow.scope.mirrored_view_enabled()
diff --git a/python/oneflow/compatible/single_client/framework/id_util.py b/python/oneflow/compatible/single_client/framework/id_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee1bde48edda821bbe9974be794cc2249cd9d639
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/id_util.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+
+
+def UniqueStr(prefix):
+    return oneflow._oneflow_internal.UniqueStr(prefix)
diff --git a/python/oneflow/compatible/single_client/framework/input_blob_def.py b/python/oneflow/compatible/single_client/framework/input_blob_def.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a10372ab7f2ca2488e6b3b1a1540122b508a5e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/input_blob_def.py
@@ -0,0 +1,286 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import sys
+import traceback
+from functools import reduce
+from typing import Any, Optional, Sequence, Union
+
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import (
+    compile_context as compile_context,
+)
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_ctx,
+)
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.job import sbp_parallel_pb2 as sbp_parallel_pb
+from oneflow.core.operator import interface_blob_conf_pb2 as inter_face_blob_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+class ArgBlobDef(object):
+    def __init__(
+        self,
+        shape,
+        dtype,
+        name=None,
+        distribute=oneflow._oneflow_internal.distribute.auto(),
+    ):
+        lbi = lbi_util.LogicalBlobId()
+        if name is None:
+            name = id_util.UniqueStr("Input_")
+        lbi.set_op_name(name)
+        lbi.set_blob_name("out")
+        self.lbi_ = lbi
+        assert type(shape) is tuple
+        for dim in shape:
+            assert type(dim) is int
+            assert dim > 0
+        self.shape_ = shape
+        self.dtype_ = dtype
+        self.distribute_ = distribute
+
+    @property
+    def lbi(self):
+        return self.lbi_
+
+    @property
+    def op_name(self):
+        return self.lbi_.op_name()
+
+    @property
+    def blob_name(self):
+        return self.lbi_.blob_name()
+
+    @property
+    def unique_name(self):
+        return self.op_name + "/" + self.blob_name + self._Distribute2Str()
+
+    @property
+    def shape(self):
+        return self.shape_
+
+    @property
+    def dtype(self):
+        return self.dtype_
+
+    @property
+    def is_dynamic(self):
+        raise NotImplementedError
+
+    def with_distribute(self, distribute):
+        return type(self)(shape=self.shape_, dtype=self.dtype_, name=self.op_name)
+
+    def Clone(self, op_name=None):
+        return type(self)(shape=self.shape_, dtype=self.dtype_, name=op_name)
+
+    def AddAndInferOp(self, op_conf):
+        raise NotImplementedError
+
+    def EagerAddAndInferOp(self, op_conf):
+        raise NotImplementedError
+
+    def CheckAndAsyncPush(self, session, arg_ndarray):
+        self._CheckNdarray(arg_ndarray)
+        self._AsyncPush(session, arg_ndarray)
+
+    def _CheckNdarray(self, ndarray):
+        raise NotImplementedError
+
+    def _AsyncPush(self, session, arg_ndarray):
+        raise NotImplementedError
+
+    def ToInterfaceBlobConf(self):
+        interface_blob_conf = inter_face_blob_conf_util.InterfaceBlobConf()
+        interface_blob_conf.shape.dim.extend(self.shape_)
+        interface_blob_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            self.dtype_
+        )
+        interface_blob_conf.is_dynamic = self.is_dynamic
+        sbp_parallel = sbp_parallel_pb.SbpParallel()
+        sbp_parallel.split_parallel.axis = 0
+        interface_blob_conf.parallel_distribution.sbp_parallel.extend([sbp_parallel])
+        return interface_blob_conf
+
+    def _Distribute2Str(self):
+        if (
+            type(self.distribute_)
+            is oneflow._oneflow_internal.distribute.AutoDistribute
+        ):
+            return ""
+        elif (
+            type(self.distribute_)
+            is oneflow._oneflow_internal.distribute.SplitDistribute
+        ):
+            return ":S" + str(self.distribute_.axis)
+        elif (
+            type(self.distribute_)
+            is oneflow._oneflow_internal.distribute.BroadcastDistribute
+        ):
+            return ":B"
+        else:
+            raise NotImplementedError
+
+
+class FixedTensorDef(ArgBlobDef):
+    def __init__(
+        self,
+        shape: Sequence[int],
+        dtype: flow.dtype = flow.float,
+        name: Optional[str] = None,
+    ) -> None:
+        ArgBlobDef.__init__(self, shape, dtype=dtype, name=name)
+
+    @property
+    def is_dynamic(self) -> bool:
+        return False
+
+    def AddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> Any:
+        return compile_context.CurJobAddConsistentOp(op_conf)
+
+    def EagerAddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> Any:
+        parallel_symbol = flow.current_scope().device_parallel_desc_symbol
+        if (
+            parallel_symbol.device_tag == "gpu"
+            and list(dict(parallel_symbol.machine_id2device_id_list).keys()) == [0]
+            and (parallel_symbol.parallel_num == 1)
+        ):
+            device_tag = "gpu"
+            device_ids = "@0:%s" % parallel_symbol.machine_id2device_id_list[0][0]
+        else:
+            device_tag = "cpu"
+            device_ids = "@0:0"
+        with flow.scope.placement(device_tag, device_ids):
+            return compile_context.CurJobAddConsistentOp(op_conf)
+
+    def _CheckNdarray(self, ndarray: np.ndarray) -> None:
+        assert isinstance(ndarray, np.ndarray)
+        assert ndarray.shape == self.shape
+
+    def _AsyncPush(self, session: object, arg_ndarray: np.ndarray) -> None:
+        session.AsyncPush(self.op_name, _MakePushNdarrayCallback(arg_ndarray))
+
+
+class MirroredTensorDef(ArgBlobDef):
+    def __init__(
+        self,
+        shape: Sequence[int],
+        dtype: flow.dtype = flow.float,
+        name: Optional[str] = None,
+    ) -> None:
+        assert type(shape) is tuple
+        ArgBlobDef.__init__(self, shape, dtype=dtype, name=name)
+        self.sub_consistent_blob_list_ = []
+
+    @property
+    def is_dynamic(self) -> bool:
+        return True
+
+    def AddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> None:
+        _AddAndInferMirroredOp(
+            self.unique_name, op_conf, self.sub_consistent_blob_list_
+        )
+
+    def EagerAddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> Any:
+        return compile_context.CurJobAddMirroredOp(op_conf)
+
+    def _CheckNdarray(self, ndarray_list: Sequence[np.ndarray]) -> None:
+        assert isinstance(ndarray_list, (list, tuple))
+        assert len(self.sub_consistent_blob_list_) == len(ndarray_list)
+
+        def GetElemCnt(shape):
+            return reduce(lambda x, y: x * y, shape, 1)
+
+        for (consistent_blob, ndarray) in zip(
+            self.sub_consistent_blob_list_, ndarray_list
+        ):
+            assert type(ndarray) is np.ndarray
+            assert len(ndarray.shape) == len(self.shape)
+            assert GetElemCnt(ndarray.shape) <= GetElemCnt(self.shape)
+
+    def _AsyncPush(self, session: object, ndarray_list: Sequence[np.ndarray]) -> None:
+        for i in range(len(ndarray_list)):
+            sub_blob = self.sub_consistent_blob_list_[i]
+            session.AsyncPush(
+                sub_blob.op_name, _MakePushNdarrayCallback(ndarray_list[i])
+            )
+
+
+def _AddAndInferMirroredOp(mirrored_lbn, op_conf, sub_consistent_blob_list):
+    compile_context.CurJobAddMirroredOp(op_conf)
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    num_sub_lbi = c_api_util.JobBuildAndInferCtx_MirroredBlobGetNumSubLbi(
+        job_name, mirrored_lbn
+    )
+    for i in range(num_sub_lbi):
+        sub_lbi = c_api_util.JobBuildAndInferCtx_MirroredBlobGetSubLbi(
+            job_name, mirrored_lbn, i
+        )
+        lbi = lbi_util.LogicalBlobId()
+        lbi.set_op_name(sub_lbi.op_name)
+        lbi.set_blob_name(sub_lbi.blob_name)
+        sub_consistent_blob_list.append(
+            oneflow._oneflow_internal.ConsistentBlob(
+                lbi, "", oneflow._oneflow_internal.distribute.auto()
+            )
+        )
+
+
+def _MakePushNdarrayCallback(ndarray):
+    copied = np.copy(ndarray, order="C")
+
+    def Copy(ofblob):
+        capacity = reduce(lambda x, y: x * y, ofblob.static_shape, 1)
+        elem_cnt = reduce(lambda x, y: x * y, copied.shape, 1)
+        assert elem_cnt <= capacity, "%s v.s. %s" % (copied.shape, ofblob.static_shape)
+        ofblob.CopyFromNdarray(copied)
+
+    return Copy
+
+
+class DeprecatedFixedTensorDef(FixedTensorDef):
+    def __init__(self, *args, **kwargs):
+        running_script = traceback.format_stack()[-2].split(",")[0].split(" ")[3]
+        if not running_script.endswith('input_blob_def.py"'):
+            print(
+                "WARNING: oneflow.compatible.single_client.FixedTensorDef has been deprecated. Please use oneflow.compatible.single_client.typing.Numpy.Placeholder instead."
+            )
+            print(
+                "For instance:\n            - def job_func(images=oneflow.compatible.single_client.FixedTensorDef((32, 1, 28, 28), dtype=flow.float))\n            + def job_func(images:oneflow.compatible.single_client.typing.Numpy.Placeholder((32, 1, 28, 28), dtype=flow.float))"
+            )
+            print(traceback.format_stack()[-2])
+        super().__init__(*args, **kwargs)
+
+
+class DeprecatedMirroredTensorDef(MirroredTensorDef):
+    def __init__(self, *args, **kwargs):
+        running_script = traceback.format_stack()[-2].split(",")[0].split(" ")[3]
+        if not running_script.endswith('input_blob_def.py"'):
+            print(
+                "WARNING: oneflow.compatible.single_client.MirroredTensorDef has been deprecated. Please use oneflow.compatible.single_client.typing.ListNumpy.Placeholder instead."
+            )
+            print(
+                "For instance:\n            - def job_func(images=oneflow.compatible.single_client.MirroredTensorDef((32, 1, 28, 28), dtype=flow.float))\n            + def job_func(images:oneflow.compatible.single_client.typing.ListNumpy.Placeholder((32, 1, 28, 28), dtype=flow.float))"
+            )
+            print(traceback.format_stack()[-2])
+        super().__init__(*args, **kwargs)
diff --git a/python/oneflow/compatible/single_client/framework/interpret_util.py b/python/oneflow/compatible/single_client/framework/interpret_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..b14052a443283d9ece0616a6bd236ee9d61f2420
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/interpret_util.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import gradient_util as gradient_util
+from oneflow.compatible.single_client.framework import compile_context as compile_ctx
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def Forward(op_conf, scope_symbol=None):
+    if scope_symbol is None:
+        scope_symbol = flow.current_scope()
+    func = enable_if.unique([LazyInfer, EagerForward])
+    return func(compile_ctx.CurJobAddOp, op_conf, scope_symbol)
+
+
+def OpKernelForward(op_conf, opkernel_object):
+    func = enable_if.unique([LazyOpKernelInfer, EagerOpKernelForward])
+    return func(compile_ctx.CurJobAddOp, op_conf, opkernel_object)
+
+
+def ConsistentForward(op_conf, scope_symbol=None):
+    if scope_symbol is None:
+        scope_symbol = flow.current_scope()
+    func = enable_if.unique([LazyInfer, EagerForward])
+    return func(compile_ctx.CurJobAddConsistentOp, op_conf, scope_symbol)
+
+
+def OpKernelConsistentForward(op_conf, opkernel_object):
+    func = enable_if.unique([LazyOpKernelInfer, EagerOpKernelForward])
+    return func(compile_ctx.CurJobAddConsistentOp, op_conf, opkernel_object)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyInfer(add_and_infer, op_conf, scope_symbol=None):
+    return add_and_infer(op_conf, scope_symbol)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyOpKernelInfer(add_and_infer, op_conf, opkernel_object):
+    return add_and_infer(op_conf, opkernel_object.scope_symbol)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerForward(add_and_infer, op_conf, scope_symbol=None):
+    op_attribute = add_and_infer(op_conf, scope_symbol)
+    parallel_conf = scope_symbol.device_parallel_desc_symbol.parallel_conf
+    from oneflow.compatible.single_client.eager import op_executor as op_executor
+
+    op_executor.Interpret(op_attribute, parallel_conf, blob_register)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    gradient_util.TrySetBackwardUsedBlobObject(
+        op_attribute, blob_register, bw_blob_register
+    )
+    return op_attribute
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerOpKernelForward(add_and_infer, op_conf, opkernel_object):
+    op_attribute = add_and_infer(op_conf, opkernel_object.scope_symbol)
+    from oneflow.compatible.single_client.eager import op_executor as op_executor
+
+    op_executor.OpKernelCall(opkernel_object, op_attribute, blob_register)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    gradient_util.TrySetBackwardUsedBlobObject(
+        op_attribute, blob_register, bw_blob_register
+    )
+    return op_attribute
diff --git a/python/oneflow/compatible/single_client/framework/job_instance.py b/python/oneflow/compatible/single_client/framework/job_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..220bdf3df80dba24362ab47cf1926ac30b7284bd
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/job_instance.py
@@ -0,0 +1,145 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import sys
+import traceback
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.framework import ofblob as ofblob
+
+
+def MakeUserJobInstance(job_name, finish_cb=None):
+    return MakeJobInstance(job_name, finish_cb=finish_cb)
+
+
+def MakePullJobInstance(job_name, op_name, pull_cb, finish_cb=None):
+    return MakeJobInstance(
+        job_name,
+        sole_output_op_name_in_user_job=op_name,
+        pull_cb=pull_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def MakePushJobInstance(job_name, op_name, push_cb, finish_cb=None):
+    return MakeJobInstance(
+        job_name,
+        sole_input_op_name_in_user_job=op_name,
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def MakeArgPassJobInstance(job_name, src_op_name, dst_op_name, finish_cb=None):
+    return MakeJobInstance(
+        job_name,
+        sole_output_op_name_in_user_job=src_op_name,
+        sole_input_op_name_in_user_job=dst_op_name,
+        finish_cb=finish_cb,
+    )
+
+
+def MakeJobInstance(*arg, **kw):
+    def _DoNothing():
+        pass
+
+    if "finish_cb" not in kw or kw["finish_cb"] is None:
+        kw["finish_cb"] = _DoNothing
+    job_instance = JobInstance(*arg, **kw)
+    global _flying_job_instance
+    _flying_job_instance[id(job_instance)] = job_instance
+
+    def DereferenceJobInstance(job_instance):
+        global _flying_job_instance
+        del _flying_job_instance[id(job_instance)]
+
+    job_instance.AddPostFinishCallback(DereferenceJobInstance)
+    return job_instance
+
+
+class JobInstance(oneflow._oneflow_internal.JobInstance):
+    def __init__(
+        self,
+        job_name,
+        sole_input_op_name_in_user_job=None,
+        sole_output_op_name_in_user_job=None,
+        push_cb=None,
+        pull_cb=None,
+        finish_cb=None,
+    ):
+        oneflow._oneflow_internal.JobInstance.__init__(self)
+        self.thisown = 0
+        self.job_name_ = str(job_name)
+        self.sole_input_op_name_in_user_job_ = str(sole_input_op_name_in_user_job)
+        self.sole_output_op_name_in_user_job_ = str(sole_output_op_name_in_user_job)
+        self.push_cb_ = push_cb
+        self.pull_cb_ = pull_cb
+        self.finish_cb_ = finish_cb
+        self.post_finish_cbs_ = []
+
+    def job_name(self):
+        try:
+            return self.job_name_
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def sole_input_op_name_in_user_job(self):
+        try:
+            return self.sole_input_op_name_in_user_job_
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def sole_output_op_name_in_user_job(self):
+        try:
+            return self.sole_output_op_name_in_user_job_
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def PushBlob(self, of_blob_ptr):
+        try:
+            self.push_cb_(ofblob.OfBlob(of_blob_ptr))
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def PullBlob(self, of_blob_ptr):
+        try:
+            self.pull_cb_(ofblob.OfBlob(of_blob_ptr))
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def Finish(self):
+        try:
+            self.finish_cb_()
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+        finally:
+            try:
+                for post_finish_cb in self.post_finish_cbs_:
+                    post_finish_cb(self)
+            except Exception as e:
+                print(traceback.format_exc())
+                raise e
+
+    def AddPostFinishCallback(self, cb):
+        self.post_finish_cbs_.append(cb)
+
+
+_flying_job_instance = {}
diff --git a/python/oneflow/compatible/single_client/framework/job_set_util.py b/python/oneflow/compatible/single_client/framework/job_set_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c9970b4c1c817073f6787f32861a3d75677cd1d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/job_set_util.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, TypeVar
+
+from oneflow.core.job.job_set_pb2 import JobSet
+
+_VT = TypeVar("_VT")
+
+
+def inter_job_reuse_mem_strategy(
+    strategy_str: str, job_set: Optional[JobSet] = None, **kwargs: _VT
+) -> None:
+    """Set memory sharing strategy for job set.
+
+    Args:
+        strategy_str: An optional `string` from: `mem_sharing_priority`, `parallelism_priority` 
+        or `custom_parallelism`. 
+        job_set: A `JobSet` object. If None, set default job set.
+    """
+    assert type(strategy_str) is str
+    if job_set == None:
+        job_set = _default_job_set
+    if strategy_str == "reuse_mem_priority":
+        job_set.inter_job_reuse_mem_strategy.reuse_mem_priority.SetInParent()
+        assert job_set.inter_job_reuse_mem_strategy.HasField("reuse_mem_priority")
+    elif strategy_str == "parallelism_priority":
+        job_set.inter_job_reuse_mem_strategy.parallelism_priority.SetInParent()
+        assert job_set.inter_job_reuse_mem_strategy.HasField("parallelism_priority")
+    elif strategy_str == "custom_parallelism":
+        assert kwargs["job_name_groups"] is not None
+        for job_name_group in kwargs["job_name_groups"]:
+            group = (
+                job_set.inter_job_reuse_mem_strategy.custom_parallelism.nonparallel_group.add()
+            )
+            for job_name in job_name_group:
+                assert type(job_name) is str
+                group.job_name.append(job_name)
+
+
+_default_job_set = JobSet()
diff --git a/python/oneflow/compatible/single_client/framework/local_blob.py b/python/oneflow/compatible/single_client/framework/local_blob.py
new file mode 100644
index 0000000000000000000000000000000000000000..44a54b6e7642140fe68e8df12a41bf9565f85b77
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/local_blob.py
@@ -0,0 +1,109 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+class LocalBlob(object):
+    def __init__(self, ndarray, is_dynamic):
+        self.ndarray_ = ndarray
+        self.is_dynamic_ = is_dynamic
+
+    @property
+    def is_dynamic(self):
+        return self.is_dynamic_
+
+    def ndarray_list(self):
+        print(
+            "WARNING:",
+            "LocalBlob.ndarray_list is deprecated, please use LocalBlob.numpy()\n",
+            traceback.format_stack()[-2],
+        )
+        return self.numpy_list()
+
+    def numpy_list(self):
+        return [self.numpy()]
+
+    def ndarray(self):
+        print(
+            "WARNING:",
+            "LocalBlob.ndarray is deprecated, please use LocalBlob.numpy()\n",
+            traceback.format_stack()[-2],
+        )
+        return self.numpy()
+
+    def numpy(self, parallel_id=None):
+        assert parallel_id is None or parallel_id == 0
+        return self.ndarray_
+
+    def parallel_num(self):
+        return 1
+
+    def __getattr__(self, attr):
+        return getattr(self.numpy(), attr)
+
+
+def MakeLocalBlob4EagerBlob(eager_blob):
+    assert isinstance(eager_blob, oneflow._oneflow_internal.EagerBlobTrait)
+    if isinstance(eager_blob, oneflow._oneflow_internal.EagerMirroredBlob):
+        assert eager_blob.numpy_size() == 1
+        return LocalBlob(eager_blob.numpy(), is_dynamic=eager_blob.is_dynamic)
+    elif isinstance(eager_blob, oneflow._oneflow_internal.EagerConsistentBlob):
+        return LocalBlob(eager_blob.numpy(), is_dynamic=False)
+    else:
+        raise NotImplementedError
+
+
+non_override_field = set(
+    [
+        "__class__",
+        "__doc__",
+        "__new__",
+        "__init__",
+        "__del__",
+        "__call__",
+        "__getattr__",
+        "__getattribute__",
+        "__setattr__",
+        "__delattr__",
+        "__dir__",
+        "__get__",
+        "__set__",
+        "__delete__",
+    ]
+)
+
+
+def MakeBlobMethod(field_name):
+    def ConvertOtherArgs(args):
+        return [x.numpy() if isinstance(x, LocalBlob) else x for x in args]
+
+    return lambda self, *args: getattr(self.numpy(), field_name)(
+        *ConvertOtherArgs(args)
+    )
+
+
+for field_name in dir(np.ndarray):
+    if field_name.startswith("__") == False:
+        continue
+    if field_name in non_override_field:
+        continue
+    if hasattr(LocalBlob, field_name) == False:
+        setattr(LocalBlob, field_name, MakeBlobMethod(field_name))
diff --git a/python/oneflow/compatible/single_client/framework/model.py b/python/oneflow/compatible/single_client/framework/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d8bf6e1e27d47f8860934d22eecff6cf05e1196
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/model.py
@@ -0,0 +1,760 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+__all__ = [
+    "DataModule",
+    "NumpyDataModule",
+    "TrainingConfig",
+    "ValidationConfig",
+    "CheckpointConfig",
+    "Callback",
+    "Model",
+]
+import inspect
+from abc import ABC
+from typing import Any, List, Optional, Tuple, Union
+
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.framework import dtype as dtype_util
+from oneflow.compatible.single_client.framework import typing as oneflow_typing
+from oneflow.compatible.single_client.framework.check_point_v2 import (
+    GetCheckpoint,
+    LoadVariables,
+    SaveVarDict,
+)
+from oneflow.compatible.single_client.framework.function_util import (
+    FunctionConfig as ExecutionConfig,
+)
+from oneflow.compatible.single_client.framework.function_util import (
+    api_oneflow_function,
+)
+from oneflow.compatible.single_client.framework.local_blob import LocalBlob
+from oneflow.compatible.single_client.framework.session_util import (
+    api_clear_default_session,
+)
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.optimizer.optimizer import (
+    Optimizer as OOPOptimizer,
+)
+from oneflow.compatible.single_client.ops.optimizer import Optimizer
+
+
+class DataModule(Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, step_idx: int = 0, optimizer_idx: int = 0):
+        pass
+
+    def infer_oneflow_data_placeholder(
+        self, batch: Tuple[Any] = None, optimizer_idx: int = 0
+    ):
+        return None
+
+
+class NumpyDataModule(DataModule):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, step_idx: int = 0, optimizer_idx: int = 0):
+        pass
+
+    def __call__(self, *args):
+        ret = self.forward(*args)
+        return ret
+
+    def infer_oneflow_data_placeholder(
+        self, batch: Tuple[np.ndarray, ...] = None, optimizer_idx: int = 0
+    ):
+        assert isinstance(batch, tuple), "model.NumpyDataModule must return a tuple."
+        data_placeholder_list = []
+        for item in batch:
+            assert isinstance(
+                item, np.ndarray
+            ), "model.NumpyDataModule must return a tuple of numpy."
+            of_dtype = dtype_util.convert_numpy_dtype_to_oneflow_dtype(item.dtype)
+            numpy_placeholder = oneflow_typing.Numpy.Placeholder(
+                shape=item.shape, dtype=of_dtype
+            )
+            data_placeholder_list.append(numpy_placeholder)
+        return data_placeholder_list
+
+
+class TrainingConfig:
+    def __init__(self):
+        super().__init__()
+        self.exe_cfg = ExecutionConfig()
+        self.data = None
+        self.error_msg = ""
+
+    def config_execution(self, exe_cfg: ExecutionConfig = None):
+        self.exe_cfg = exe_cfg
+
+    def config_data(self, data: DataModule = None):
+        self.data = data
+
+    def check_valid(self):
+        is_valid = True
+        self.error_msg = ""
+        if not isinstance(self.exe_cfg, ExecutionConfig):
+            self.error_msg += "model.TrainingConfig exe_cfg is not ExecutionConfig;"
+            is_valid = False
+        if self.data is None:
+            self.error_msg += "model.TrainingConfig data is None;"
+            is_valid = False
+        if not isinstance(self.data, DataModule):
+            self.error_msg += "model.TrainingConfig data is not DataModule;"
+            is_valid = False
+        return is_valid
+
+
+class ValidationConfig:
+    def __init__(self):
+        super().__init__()
+        self.exe_cfg = ExecutionConfig()
+        self.data = None
+        self.step_interval = 10
+        self.error_msg = ""
+
+    def config_execution(self, exe_cfg: ExecutionConfig = None):
+        self.exe_cfg = exe_cfg
+
+    def config_data(self, data: DataModule = None):
+        self.data = data
+
+    def config_step_interval(self, step_interval: int = 1):
+        self.step_interval = step_interval
+
+    def check_valid(self):
+        is_valid = True
+        self.error_msg = ""
+        if self.data is None:
+            self.error_msg += "model.ValidationConfig data is None;"
+            is_valid = False
+        if not isinstance(self.data, DataModule):
+            self.error_msg += "model.ValidationConfig data is not DataModule;"
+            is_valid = False
+        if self.step_interval <= 0 or not isinstance(self.step_interval, int):
+            self.error_msg += (
+                "model.ValidationConfig step_interval is <= 0 or is not int;"
+            )
+            is_valid = False
+        return is_valid
+
+
+class CheckpointConfig(object):
+    def __init__(self):
+        self.need_load = False
+        self.load_dirpath = None
+        self.need_save = False
+        self.save_dirpath = None
+        self.save_step_interval = 1
+        self.error_msg = ""
+
+    def config_load(self, dirpath: str = None):
+        self.need_load = True
+        assert dirpath is not None, "dirpath should not be None"
+        self.load_dirpath = dirpath
+
+    def config_save(self, dirpath: str = None, step_interval: int = 1):
+        self.need_save = True
+        self.save_dirpath = dirpath
+        assert dirpath is not None, "dirpath should not be None"
+        self.save_step_interval = step_interval
+        assert step_interval > 0, "step_interval should not <= 0"
+        assert isinstance(step_interval, int), "step_interval should be int"
+
+    def check_valid(self):
+        is_valid = True
+        self.error_msg = ""
+        return is_valid
+
+
+class Callback(ABC):
+    """ Abstract base class used to build new callbacks.
+    """
+
+    def on_training_step_end(
+        self,
+        outputs: Optional[
+            Union[LocalBlob, Tuple[LocalBlob, ...], Tensor, Tuple[Tensor, ...]]
+        ],
+        step_idx: int = 0,
+        optimizer_idx: int = 0,
+    ):
+        pass
+
+    def on_validation_step_end(
+        self,
+        outputs: Optional[
+            Union[LocalBlob, Tuple[LocalBlob, ...], Tensor, Tuple[Tensor, ...]]
+        ],
+        step_idx: int = 0,
+    ):
+        pass
+
+
+class Model(ABC, Module):
+    """A high level API for model training and validation.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self._is_deprecated_function_style = (
+            kwargs["is_deprecated_function_style"]
+            if "is_deprecated_function_style" in kwargs
+            else False
+        )
+
+    def forward(self, *args, **kwargs):
+        """Same as `nn.Module.forward()`, here is to define the operations you want to use for prediction.
+        """
+        raise NotImplementedError
+
+    def training_step(self, *args, **kwargs):
+        """Operates on a single batch of data from the training set and return loss.
+        """
+        raise NotImplementedError()
+
+    def validation_step(self, *args, **kwargs):
+        """Operates on a single batch of data from the validation set.
+        """
+        raise NotImplementedError()
+
+    def configure_optimizers(self):
+        """Choose what optimizers and learning-rate schedulers to use in your optimization.
+        Normally you'd need one. But in the case of GANs or similar you might have multiple.
+        """
+        raise NotImplementedError()
+
+    def fit(
+        self,
+        training_config: Optional[TrainingConfig] = None,
+        validation_config: Optional[ValidationConfig] = None,
+        checkpoint_config: Optional[CheckpointConfig] = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+        max_steps: int = 100,
+    ):
+        """ Runs the full training and validation routine.
+        """
+        self._max_steps = max_steps
+        api_clear_default_session()
+        self._sub_models = self._get_and_check_sub_models(
+            training_config, validation_config, checkpoint_config, callbacks
+        )
+        if len(self._sub_models) == 0:
+            return
+        if self._checkpoint_model.is_valid:
+            self._checkpoint_model.load()
+        for step_idx in range(0, self._max_steps):
+            for sub_model in self._sub_models:
+                try:
+                    sub_model.step(step_idx)
+                except Exception as e:
+                    print(
+                        "Model step_idx {} {} failed.".format(step_idx, sub_model.name)
+                    )
+                    raise e
+
+    def method_overrided(self, method_name: str = None) -> bool:
+        return getattr(self.__class__, method_name) != getattr(Model, method_name)
+
+    def _get_and_check_sub_models(
+        self,
+        training_config: Optional[TrainingConfig] = None,
+        validation_config: Optional[ValidationConfig] = None,
+        checkpoint_config: Optional[CheckpointConfig] = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        sub_models = []
+        self._train_model = (
+            TrainModel(training_config, self, callbacks)
+            if self._is_deprecated_function_style
+            else TrainModelOOPStyle(training_config, self, callbacks)
+        )
+        if self._train_model.is_valid:
+            sub_models.append(self._train_model)
+        elif training_config is not None:
+            print(
+                self._train_model.error_msg,
+                "{}'s fit() will not do training.".format(self.__class__.__name__),
+            )
+        self._val_model = (
+            ValidateModel(validation_config, self, callbacks)
+            if self._is_deprecated_function_style
+            else ValidateModelOOPStyle(validation_config, self, callbacks)
+        )
+        if self._val_model.is_valid:
+            sub_models.append(self._val_model)
+        elif validation_config is not None:
+            print(
+                self._val_model.error_msg,
+                "{}'s fit() will not do validation.".format(self.__class__.__name__),
+            )
+        if len(sub_models) == 0:
+            print(
+                "{}'s fit() will do nothing because there has no valid configuration.".format(
+                    self.__class__.__name__
+                )
+            )
+            return sub_models
+        self._checkpoint_model = (
+            CheckpointModel(checkpoint_config, self, callbacks)
+            if self._is_deprecated_function_style
+            else CheckpointModelOOPStyle(checkpoint_config, self, callbacks)
+        )
+        if self._checkpoint_model.is_valid:
+            sub_models.append(self._checkpoint_model)
+        elif checkpoint_config is not None:
+            print(
+                self._checkpoint_model.error_msg,
+                "{}'s fit() will not do checkpoint.".format(self.__class__.__name__),
+            )
+        return sub_models
+
+
+class SubModel(ABC):
+    def __init__(self, name, cfg, model, callbacks):
+        self._cfg = cfg
+        assert isinstance(model, Model)
+        self._model = model
+        self._cbs = callbacks
+        self.name = name
+        self.is_valid = True
+        self.error_msg = (
+            self._model.__class__.__name__ + " " + self.name + " error message: "
+        )
+        if not self._get_and_check_cfg():
+            self.is_valid = False
+        if not self._get_and_check_cbs():
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        raise NotImplementedError
+
+    def _get_and_check_cfg(self):
+        if self._cfg is None:
+            self.error_msg += "config is None;"
+            return False
+        if not self._cfg.check_valid():
+            self.error_msg += self._cfg.error_msg
+            return False
+        else:
+            return True
+
+    def _get_and_check_cbs(self):
+        if self._cbs is None:
+            self._cbs = []
+            return True
+        if isinstance(self._cbs, Callback):
+            self._cbs = [self._cbs]
+            return True
+        if isinstance(self._cbs, list):
+            for cb in self._cbs:
+                assert isinstance(
+                    cb, Callback
+                ), "model callbacks' type must be model.Callback or List[model.Callback]."
+            return True
+        assert (
+            False
+        ), "model callbacks' type must be model.Callback or List[model.Callback]."
+
+    def _method_callback(self, method_name: str = None, *args, **kwargs):
+        for cb in self._cbs:
+            method = getattr(cb, method_name)
+            method(*args, **kwargs)
+
+
+class TrainModel(SubModel):
+    def __init__(
+        self,
+        cfg: TrainingConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("training", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+        if not self._get_and_check_opts():
+            self.is_valid = False
+        if self.is_valid and (not self._get_and_check_jobs()):
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid, self.error_msg
+        for optimizer_idx in range(0, len(self._opts)):
+            outputs = None
+            if self._is_numpy_input:
+                batch = None
+                if step_idx == 0:
+                    batch = self._first_numpy_batch[optimizer_idx]
+                else:
+                    batch = self._cfg.data(step_idx, optimizer_idx)
+                outputs = self._jobs[optimizer_idx](*batch).get()
+            else:
+                outputs = self._jobs[optimizer_idx]().get()
+            self._method_callback(
+                "on_training_step_end",
+                outputs=outputs,
+                step_idx=step_idx,
+                optimizer_idx=optimizer_idx,
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("training_step"):
+            self.error_msg += "model.training_step() is empty;"
+            return False
+        else:
+            return True
+
+    def _get_and_check_opts(self):
+        self._opts = []
+        if not self._model.method_overrided("configure_optimizers"):
+            self.error_msg += "model.configure_optimizers() is empty;"
+            return False
+        opt_conf = self._model.configure_optimizers()
+        if isinstance(opt_conf, Optimizer):
+            self._opts = [opt_conf]
+        elif isinstance(opt_conf, (list, tuple)):
+            for opt in opt_conf:
+                assert isinstance(
+                    opt, Optimizer
+                ), "model.configure_optimizers() must return Optimizer                     or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+            self._opts = opt_conf
+        else:
+            assert (
+                False
+            ), "model.configure_optimizers() must return Optimizer                 or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+        return True
+
+    def _get_and_check_jobs(self):
+        self._is_numpy_input = (
+            True if isinstance(self._cfg.data, NumpyDataModule) else False
+        )
+        self._jobs = []
+        if self._is_numpy_input:
+            self._first_numpy_batch = []
+            for optimizer_idx in range(0, len(self._opts)):
+                batch = self._cfg.data(0, optimizer_idx)
+                self._first_numpy_batch.insert(optimizer_idx, batch)
+                self._jobs.insert(
+                    optimizer_idx, self._construct_numpy_job(batch, optimizer_idx)
+                )
+        else:
+            for optimizer_idx in range(0, len(self._opts)):
+                self._jobs.insert(optimizer_idx, self._construct_job(optimizer_idx))
+        return True
+
+    def _construct_job(self, optimizer_idx: int = 0):
+        def job():
+            batch = self._cfg.data(0, optimizer_idx)
+            outputs = self._model.training_step(
+                batch=batch, optimizer_idx=optimizer_idx
+            )
+            loss = None
+            if isinstance(outputs, tuple) and len(outputs) > 0:
+                loss = outputs[0]
+            else:
+                loss = outputs
+            self._opts[optimizer_idx].minimize(loss)
+            return outputs
+
+        job.__name__ = (
+            self._model.__class__.__name__ + "_Model_train_job_" + str(optimizer_idx)
+        )
+        deco = api_oneflow_function(type="train", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+    def _construct_numpy_job(self, batch, optimizer_idx):
+        def job(*input_batch):
+            outputs = self._model.training_step(
+                batch=input_batch, optimizer_idx=optimizer_idx
+            )
+            loss = None
+            if isinstance(outputs, tuple) and len(outputs) > 0:
+                loss = outputs[0]
+            else:
+                loss = outputs
+            self._opts[optimizer_idx].minimize(loss)
+            return outputs
+
+        _infer_job_signature(self._cfg.data, batch, optimizer_idx, job)
+        job.__name__ = (
+            self._model.__class__.__name__
+            + "_Model_train_numpy_job_"
+            + str(optimizer_idx)
+        )
+        deco = api_oneflow_function(type="train", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+
+class ValidateModel(SubModel):
+    def __init__(
+        self,
+        cfg: ValidationConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("validation", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+        if self.is_valid and (not self._get_and_check_job()):
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if (step_idx + 1) % self._cfg.step_interval == 0:
+            outputs = None
+            if self._is_numpy_input:
+                batch = None
+                if step_idx == 0:
+                    batch = self._first_numpy_batch
+                else:
+                    batch = self._cfg.data(step_idx, 0)
+                outputs = self._job(*batch).get()
+            else:
+                outputs = self._job().get()
+            self._method_callback(
+                "on_validation_step_end", step_idx=step_idx, outputs=outputs
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("validation_step"):
+            self.error_msg += "model.validation_step() is empty;"
+            return False
+        else:
+            return True
+
+    def _get_and_check_job(self):
+        self._is_numpy_input = (
+            True if isinstance(self._cfg.data, NumpyDataModule) else False
+        )
+        self._job = None
+        if not self._is_numpy_input:
+            self._job = self._construct_job()
+        else:
+            batch = self._cfg.data(0, 0)
+            self._first_numpy_batch = batch
+            self._job = self._construct_numpy_job(batch)
+        return True
+
+    def _construct_job(self):
+        def job():
+            batch = self._cfg.data(0, 0)
+            return self._model.validation_step(batch)
+
+        job.__name__ = self._model.__class__.__name__ + "_Model_eval_job"
+        deco = api_oneflow_function(type="predict", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+    def _construct_numpy_job(self, batch: Tuple[np.ndarray, ...] = None):
+        def job(*input_batch):
+            return self._model.validation_step(batch=input_batch)
+
+        _infer_job_signature(self._cfg.data, batch, 0, job)
+        job.__name__ = self._model.__class__.__name__ + "_Model_eval_numpy_job"
+        deco = api_oneflow_function(type="predict", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+
+class CheckpointModel(SubModel):
+    def __init__(
+        self,
+        cfg: CheckpointConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("checkpointing", cfg, model, callbacks)
+
+    def load(self):
+        assert self.is_valid
+        if self._cfg.need_load:
+            self._load_checkpoint(self._cfg.load_dirpath)
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if self._cfg.need_save:
+            if (step_idx + 1) % self._cfg.save_step_interval == 0:
+                self._save_checkpoint(
+                    dirpath=self._cfg.save_dirpath + "-" + str(step_idx)
+                )
+
+    def _load_checkpoint(self, dirpath: str):
+        """Load model states from a checkpoint.
+        """
+        LoadVariables(GetCheckpoint(path=dirpath))
+
+    def _save_checkpoint(self, dirpath: str):
+        """Save model states as a checkpoint.
+        """
+        SaveVarDict(path=dirpath)
+
+
+class TrainModelOOPStyle(SubModel):
+    def __init__(
+        self,
+        cfg: TrainingConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("training", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+        if not self._get_and_check_opts():
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid, self.error_msg
+        for optimizer_idx in range(0, len(self._opts)):
+            batch = self._cfg.data(step_idx, optimizer_idx)
+            outputs = self._model.training_step(
+                batch=batch, optimizer_idx=optimizer_idx
+            )
+            loss = None
+            if isinstance(outputs, tuple) and len(outputs) > 0:
+                loss = outputs[0]
+            else:
+                loss = outputs
+            loss.backward()
+            opt = self._opts[optimizer_idx]
+            opt.step()
+            opt.zero_grad()
+            self._method_callback(
+                "on_training_step_end",
+                outputs=outputs,
+                step_idx=step_idx,
+                optimizer_idx=optimizer_idx,
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("training_step"):
+            self.error_msg += "model.training_step() is empty;"
+            return False
+        else:
+            return True
+
+    def _get_and_check_opts(self):
+        self._opts = []
+        if not self._model.method_overrided("configure_optimizers"):
+            self.error_msg += "model.configure_optimizers() is empty;"
+            return False
+        opt_conf = self._model.configure_optimizers()
+        if isinstance(opt_conf, OOPOptimizer):
+            self._opts = [opt_conf]
+        elif isinstance(opt_conf, (list, tuple)):
+            for opt in opt_conf:
+                assert isinstance(
+                    opt, OOPOptimizer
+                ), "model.configure_optimizers() must return Optimizer                     or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+            self._opts = opt_conf
+        else:
+            assert (
+                False
+            ), "model.configure_optimizers() must return Optimizer                 or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+        return True
+
+
+class ValidateModelOOPStyle(SubModel):
+    def __init__(
+        self,
+        cfg: ValidationConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("validation", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if (step_idx + 1) % self._cfg.step_interval == 0:
+            outputs = None
+            with oneflow._oneflow_internal.autograd.no_grad():
+                inputs = self._cfg.data(step_idx, 0)
+                model_previous_mode = self._model.training
+                self._model.train()
+                outputs = self._model.validation_step(inputs)
+                self._model.train(model_previous_mode)
+            self._method_callback(
+                "on_validation_step_end", step_idx=step_idx, outputs=outputs
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("validation_step"):
+            self.error_msg += "model.validation_step() is empty;"
+            return False
+        else:
+            return True
+
+
+class CheckpointModelOOPStyle(SubModel):
+    def __init__(
+        self,
+        cfg: CheckpointConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("checkpointing", cfg, model, callbacks)
+
+    def load(self):
+        assert self.is_valid
+        if self._cfg.need_load:
+            self._load_checkpoint(self._cfg.load_dirpath)
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if self._cfg.need_save:
+            if (step_idx + 1) % self._cfg.save_step_interval == 0:
+                self._save_checkpoint(
+                    dirpath=self._cfg.save_dirpath + "-" + str(step_idx)
+                )
+
+    def _load_checkpoint(self, dirpath: str):
+        """Load model states from a checkpoint.
+        """
+        stat_dict = GetCheckpoint(path=dirpath)
+        self._model.load_state_dict(stat_dict)
+
+    def _save_checkpoint(self, dirpath: str):
+        """Save model states as a checkpoint.
+        """
+        stat_dict = self._model.state_dict()
+        SaveVarDict(path=dirpath, var_dict=stat_dict)
+
+
+def _infer_job_signature(data_module, batch, optimizer_idx, job):
+    para_list = []
+    placeholder_list = data_module.infer_oneflow_data_placeholder(batch, optimizer_idx)
+    for (i, placeholder) in enumerate(placeholder_list):
+        para_name = (
+            data_module.__class__.__name__
+            + "_opt_"
+            + str(optimizer_idx)
+            + "_para_"
+            + str(i)
+        )
+        para_list.append(
+            inspect.Parameter(
+                name=para_name,
+                kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                annotation=placeholder,
+            )
+        )
+    origin_sig = inspect.signature(job)
+    new_sig = origin_sig.replace(parameters=para_list)
+    job.__oneflow_function_signature__ = new_sig
diff --git a/python/oneflow/compatible/single_client/framework/module.py b/python/oneflow/compatible/single_client/framework/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a9791628b0b9268d89b6166daf6a7964b67ad7c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/module.py
@@ -0,0 +1,45 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework import id_util as id_util
+
+
+class Module(object):
+    def __init__(self, name=None):
+        if name is None:
+            name = id_util.UniqueStr("Module_")
+        self.module_name_ = name
+        self.call_seq_no_ = 0
+
+    @property
+    def module_name(self):
+        return self.module_name_
+
+    @property
+    def call_seq_no(self):
+        return self.call_seq_no_
+
+    def forward(self, *args):
+        raise NotImplementedError()
+
+    def __call__(self, *args):
+        ret = self.forward(*args)
+        self.call_seq_no_ = self.call_seq_no_ + 1
+        return ret
+
+    def __del__(self):
+        assert (
+            getattr(type(self), "__call__") is Module.__call__
+        ), "do not override __call__"
diff --git a/python/oneflow/compatible/single_client/framework/ofblob.py b/python/oneflow/compatible/single_client/framework/ofblob.py
new file mode 100644
index 0000000000000000000000000000000000000000..963284d4717f5ec85d0cd8ac4a77c58c88d5cc5d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/ofblob.py
@@ -0,0 +1,105 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from functools import reduce
+
+import numpy as np
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.dtype import (
+    convert_proto_dtype_to_oneflow_dtype,
+)
+from oneflow.compatible.single_client.support.box import Box
+
+
+class OfBlob(object):
+    def __init__(self, of_blob_ptr):
+        self.of_blob_ptr_ = of_blob_ptr
+
+    @property
+    def dtype(self):
+        return convert_proto_dtype_to_oneflow_dtype(
+            oneflow._oneflow_internal.Ofblob_GetDataType(self.of_blob_ptr_)
+        )
+
+    @property
+    def static_shape(self):
+        num_axes = oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+        dst_ndarray = np.ndarray(num_axes, dtype=np.int64)
+        oneflow._oneflow_internal.OfBlob_CopyStaticShapeTo(
+            self.of_blob_ptr_, dst_ndarray
+        )
+        return tuple(dst_ndarray.tolist())
+
+    @property
+    def shape(self):
+        num_axes = oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+        dst_ndarray = np.zeros(num_axes, dtype=np.int64)
+        oneflow._oneflow_internal.OfBlob_CopyShapeTo(self.of_blob_ptr_, dst_ndarray)
+        return tuple(dst_ndarray.tolist())
+
+    def set_shape(self, shape):
+        assert isinstance(shape, (list, tuple))
+        assert len(shape) == oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+        oneflow._oneflow_internal.OfBlob_CopyShapeFrom(
+            self.of_blob_ptr_, np.array(shape, dtype=np.int64)
+        )
+
+    @property
+    def num_axes(self):
+        return oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+
+    @property
+    def is_dynamic(self):
+        return oneflow._oneflow_internal.OfBlob_IsDynamic(self.of_blob_ptr_)
+
+    def CopyToNdarray(self):
+        return self._CopyToNdarray()
+
+    def CopyFromNdarray(self, src_ndarray):
+        if self.is_dynamic:
+            self.set_shape(src_ndarray.shape)
+        else:
+            shape_tensor = np.zeros(self.num_axes, dtype=np.int64)
+            oneflow._oneflow_internal.OfBlob_CopyShapeTo(
+                self.of_blob_ptr_, shape_tensor
+            )
+            shape = tuple(shape_tensor.tolist())
+            assert src_ndarray.shape == shape
+        return self._CopyBodyFromNdarray(src_ndarray)
+
+    def _CopyBodyFromNdarray(self, src_ndarray):
+        method_name = oneflow._oneflow_internal.Dtype_GetOfBlobCopyFromBufferFuncName(
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(self.dtype)
+        )
+        copy_method = getattr(oneflow._oneflow_internal, method_name)
+        copy_method(self.of_blob_ptr_, src_ndarray)
+
+    def _CopyToNdarray(self):
+        method_name = oneflow._oneflow_internal.Dtype_GetOfBlobCopyToBufferFuncName(
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(self.dtype)
+        )
+        copy_method = getattr(oneflow._oneflow_internal, method_name)
+        shape_tensor = np.zeros(self.num_axes, dtype=np.int64)
+        oneflow._oneflow_internal.OfBlob_CopyShapeTo(self.of_blob_ptr_, shape_tensor)
+        shape = tuple(shape_tensor.tolist())
+        tensor = np.zeros(
+            shape, dtype=flow.convert_oneflow_dtype_to_numpy_dtype(self.dtype)
+        )
+        copy_method(self.of_blob_ptr_, tensor)
+        return tensor
diff --git a/python/oneflow/compatible/single_client/framework/op_expr_util.py b/python/oneflow/compatible/single_client/framework/op_expr_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f80c8610a83e71d96421ab5d4e91d24cd66c7d4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/op_expr_util.py
@@ -0,0 +1,45 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.attr_util import (
+    convert_to_user_attr_value,
+)
+
+
+def user_op_expr_call(self, *args, **kwargs):
+    args = list(args)
+    for i in range(len(args)):
+        arg = args[i]
+        if isinstance(arg, flow.Tensor):
+            if not arg.is_determined:
+                arg.determine()
+            args[i] = arg._local_or_consistent_tensor
+    attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+    for (attr_name, attr_value) in kwargs.items():
+        assert isinstance(attr_name, str)
+        attrs[attr_name] = convert_to_user_attr_value(
+            self.op_type_name, attr_name, attr_value
+        )
+    try:
+        results = self.apply(args, attrs)
+    except oneflow._oneflow_internal.exception.Exception:
+        raise oneflow._oneflow_internal.exception.GetThreadLocalLastError()
+    return results
+
+
+def RegisterMethod4UserOpExpr():
+    oneflow._oneflow_internal.one.UserOpExpr.__call__ = user_op_expr_call
diff --git a/python/oneflow/compatible/single_client/framework/op_util.py b/python/oneflow/compatible/single_client/framework/op_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2902b3a728548bd8f06cdf0f273c3576528c33c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/op_util.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.core.operator.op_conf_pb2 import OperatorConf
+
+
+def IsOpConfOnlyCpuSupported(op_conf):
+    assert isinstance(op_conf, OperatorConf)
+    '\n    global _cpu_only_op_type_cases\n    if _cpu_only_op_type_cases == None:\n        _cpu_only_op_type_cases = set()\n        for field in OperatorConf.DESCRIPTOR.oneofs_by_name["op_type"].fields:\n            if oneflow._oneflow_internal.IsOpTypeCaseCpuSupportOnly(field.number):\n                _cpu_only_op_type_cases.add(field.number)\n    op_type_field = op_conf.WhichOneof("op_type")\n    return OperatorConf.DESCRIPTOR.fields_by_name[op_type_field].number in _cpu_only_op_type_cases\n    '
+    op_type_field = op_conf.WhichOneof("op_type")
+    if op_type_field == "user_conf":
+        return IsUserOpOnlyCpuSupported(op_conf.user_conf.op_type_name)
+    else:
+        field_number = OperatorConf.DESCRIPTOR.fields_by_name[op_type_field].number
+        return oneflow._oneflow_internal.IsOpTypeCaseCpuSupportOnly(field_number)
+
+
+def IsUserOpOnlyCpuSupported(op_type_name):
+    return oneflow._oneflow_internal.IsOpTypeNameCpuSupportOnly(op_type_name)
diff --git a/python/oneflow/compatible/single_client/framework/ops.py b/python/oneflow/compatible/single_client/framework/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..936ee3ed159e8402eab28d6621e3df3d36060265
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/ops.py
@@ -0,0 +1,225 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import (
+    compile_context as compile_context,
+)
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.common import data_type_pb2 as data_type_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def api_repeat(
+    input: oneflow._oneflow_internal.BlobDesc,
+    repeat_num: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([repeat])
+    return func(input, repeat_num, name=name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def repeat(input, repeat_num, name=None):
+    assert not flow.eager_execution_enabled()
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Repeat_"))
+        .Op("repeat")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("repeat_num", repeat_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_acc(
+    one: oneflow._oneflow_internal.BlobDesc,
+    max_acc_num: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([acc])
+    return func(one, max_acc_num, name=name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def acc(one, max_acc_num, name=None):
+    assert not flow.eager_execution_enabled()
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Acc_"))
+        .Op("acc")
+        .Input("in", [one])
+        .Output("out")
+        .Attr("max_acc_num", max_acc_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_unpack(
+    input: oneflow._oneflow_internal.BlobDesc,
+    unpack_num: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([unpack])
+    return func(input, unpack_num, name=name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def unpack(input, unpack_num, name=None):
+    assert not flow.eager_execution_enabled()
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Unpack_"))
+        .Op("unpack")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("unpack_num", unpack_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_pack(
+    input: oneflow._oneflow_internal.BlobDesc, pack_num: int, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([pack])
+    return func(input, pack_num, name=name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def pack(input, pack_num, name=None):
+    assert not flow.eager_execution_enabled()
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Pack_"))
+        .Op("pack")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("pack_num", pack_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_parallel_cast(
+    input: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+    distribute: Optional[oneflow._oneflow_internal.distribute.Distribute] = None,
+    gradient_distribute: Optional[
+        oneflow._oneflow_internal.distribute.Distribute
+    ] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([parallel_cast])
+    return func(
+        input, name=name, distribute=distribute, gradient_distribute=gradient_distribute
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def parallel_cast(input, name=None, distribute=None, gradient_distribute=None):
+    if name is None:
+        name = id_util.UniqueStr("ParallelCast_")
+
+    def distribute_to_str(dist):
+        dist_str = ""
+        if dist is None:
+            pass
+        elif type(dist) is oneflow._oneflow_internal.distribute.SplitDistribute:
+            dist_str = "S({})".format(dist.axis)
+        elif type(dist) is oneflow._oneflow_internal.distribute.BroadcastDistribute:
+            dist_str = "B"
+        else:
+            raise ValueError("unsupported distribute")
+        return dist_str
+
+    sbp_parallel = distribute_to_str(distribute)
+    grad_sbp_parallel = distribute_to_str(gradient_distribute)
+    op = (
+        flow.user_op_builder(name)
+        .Op("parallel_cast")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("sbp_parallel", sbp_parallel)
+        .Attr("grad_sbp_parallel", grad_sbp_parallel)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def api_hierarchical_parallel_cast(
+    input: oneflow._oneflow_internal.BlobDesc,
+    parallel_distribution: Sequence[str],
+    grad_mode: Optional[str] = None,
+    grad_parallel_distribution: Sequence[str] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([hierarchical_parallel_cast])
+    return func(
+        input,
+        parallel_distribution=parallel_distribution,
+        grad_mode=grad_mode,
+        grad_parallel_distribution=grad_parallel_distribution,
+        name=name,
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def hierarchical_parallel_cast(
+    input, parallel_distribution, grad_mode, grad_parallel_distribution, name
+):
+    if name is None:
+        name = id_util.UniqueStr("HierarchicalParallelCast_")
+
+    def distribute_to_str(dist):
+        if dist is None:
+            return ""
+        elif type(dist) is str:
+            return dist
+        elif type(dist) is oneflow._oneflow_internal.distribute.SplitDistribute:
+            return "S({})".format(dist.axis)
+        elif type(dist) is oneflow._oneflow_internal.distribute.BroadcastDistribute:
+            return "B"
+        else:
+            raise ValueError("unsupported distribute")
+
+    op = (
+        flow.user_op_builder(name)
+        .Op("hierarchical_parallel_cast")
+        .Input("in", [input])
+        .Output("out")
+        .Attr(
+            "parallel_distribution", list(map(distribute_to_str, parallel_distribution))
+        )
+        .Attr("grad_mode", grad_mode or "restore")
+        .Attr(
+            "grad_parallel_distribution",
+            list(map(distribute_to_str, grad_parallel_distribution))
+            if grad_parallel_distribution
+            else [],
+        )
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
diff --git a/python/oneflow/compatible/single_client/framework/placement_context.py b/python/oneflow/compatible/single_client/framework/placement_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fff3546e73d245f24ca1a95d14707671fb4e39a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/placement_context.py
@@ -0,0 +1,120 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import re
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import op_util as op_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.core.job import placement_pb2 as placement_pb
+
+
+class PlacementScope(object):
+    pass
+
+
+class EmptyPlacementScope(PlacementScope):
+    def __init__(self, device_tag, machine_device_ids, hierarchy):
+        if isinstance(machine_device_ids, (list, tuple)) == False:
+            machine_device_ids = [machine_device_ids]
+        self.device_tag_ = device_tag
+        self.machine_device_ids_ = machine_device_ids
+        self.hierarchy_ = hierarchy
+
+    @property
+    def device_tag(self):
+        return self.device_tag_
+
+    @property
+    def machine_device_ids(self):
+        return self.machine_device_ids_
+
+    @property
+    def hierarchy(self):
+        return self.hierarchy_
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, *args):
+        pass
+
+
+class GlobalModePlacementScope(PlacementScope):
+    def __init__(self, scope_ctx):
+        self.scope_ctx_ = scope_ctx
+
+    def __enter__(self):
+        self.scope_ctx_.__enter__()
+
+    def __exit__(self, *args):
+        self.scope_ctx_.__exit__(*args)
+
+
+def MakeParallelConf4Resource(device_tag, resource):
+    if device_tag == "gpu":
+        assert resource.HasField("gpu_device_num")
+        machine_device_ids = GetGpuMachineDeviceIds(resource)
+    elif device_tag == "cpu":
+        assert resource.HasField("cpu_device_num")
+        machine_device_ids = GetCpuMachineDeviceIds(resource)
+    else:
+        raise NotImplementedError
+    return oneflow._oneflow_internal.MakeParallelConf(device_tag, machine_device_ids)
+
+
+def MakeMachineId2DeviceIdList(parallel_conf):
+    parallel_conf_str = str(parallel_conf)
+    global _parallel_conf_str2ofrecord
+    if parallel_conf_str not in _parallel_conf_str2ofrecord:
+        ofrecord = c_api_util.GetMachine2DeviceIdListOFRecordFromParallelConf(
+            parallel_conf
+        )
+        _parallel_conf_str2ofrecord[parallel_conf_str] = {
+            int(k): list(v.int32_list.value) for (k, v) in ofrecord.feature.items()
+        }
+    return _parallel_conf_str2ofrecord[parallel_conf_str]
+
+
+def GetParallelSize(key2list):
+    size = 0
+    for (k, v) in key2list.items():
+        size += len(v)
+    return size
+
+
+def GetGpuMachineDeviceIds(resource):
+    assert resource.machine_num > 0
+    assert resource.HasField("gpu_device_num")
+    return [
+        "%s:0-%s" % (m_id, resource.gpu_device_num - 1)
+        for m_id in range(resource.machine_num)
+    ]
+
+
+def GetCpuMachineDeviceIds(resource):
+    assert resource.machine_num > 0
+    assert resource.HasField("cpu_device_num")
+    return [
+        "%s:0-%s" % (m_id, resource.cpu_device_num - 1)
+        for m_id in range(resource.machine_num)
+    ]
+
+
+_parallel_conf_str2ofrecord = {}
diff --git a/python/oneflow/compatible/single_client/framework/placement_util.py b/python/oneflow/compatible/single_client/framework/placement_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ba0d5f4de5c035352a4986e99b94a56620ac98
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/placement_util.py
@@ -0,0 +1,141 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import re
+import traceback
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_ctx,
+)
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+@oneflow_deprecate()
+def deprecated_placement(*args, **kwargs):
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.device_prior_placement/oneflow.compatible.single_client.fixed_placement",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.compatible.single_client.scope.placement"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return api_placement(*args, **kwargs)
+
+
+def api_placement(
+    device_tag: str, machine_device_ids: str, hierarchy=None
+) -> placement_ctx.PlacementScope:
+    """Create a scope. All ops within the scope will run on specified device that placed by  "device_tag" and "machine_device_ids".
+
+    Args:
+        device_tag (str): Device tag, "cpu" or "gpu" only
+        machine_device_ids (str): List of string that specifies what machine & device(s) to use, the format is "List[<NODE INDEX>:<DEVICE START INDEX>-<DEVICE END INDEX>, <NODE INDEX>:<DEVICE START INDEX>-<DEVICE END INDEX>, ...]", For example, "0:0" means use the device 0 of machine 0, and "1:4-6" means use device 4, 5, 6 of machine 1.
+
+    Returns:
+        placement_ctx.DevicePriorPlacementScope:  Placement scope
+
+    For example:
+
+    If you run program on single machine, you can assign the specified device like this:
+
+    .. code-block:: python
+
+        with flow.scope.placement("gpu", "0:0"):
+            logits = lenet(images, train=False)
+            loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
+            flow.losses.add_loss(loss)
+
+    Or you run distributed program, you can assign the specified devices like this:
+
+    .. code-block:: python
+
+        # configure machines ids, ips, etc.
+        with flow.scope.placement("gpu", ['0:0-7', '1:0-7']):
+            logits = lenet(images, train=False)
+            loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
+            flow.losses.add_loss(loss)
+
+    """
+    if oneflow._oneflow_internal.flags.with_cuda() == False and device_tag == "gpu":
+        device_tag = "cpu"
+    assert (
+        isinstance(hierarchy, (list, tuple, oneflow._oneflow_internal.Size))
+        or hierarchy is None
+    )
+    func = enable_if.unique(
+        [
+            GetEmptyPlacementScope,
+            GetNormalModePlacementScope,
+            GetGlobalModePlacementScope,
+        ]
+    )
+    return func(device_tag, machine_device_ids, hierarchy)
+
+
+@enable_if.condition(
+    hob.in_normal_mode & hob.env_initialized & ~hob.session_initialized
+)
+def GetEmptyPlacementScope(device_tag, machine_device_ids, hierarchy=None):
+    return placement_ctx.EmptyPlacementScope(device_tag, machine_device_ids, hierarchy)
+
+
+@enable_if.condition(hob.in_normal_mode & hob.session_initialized)
+def GetNormalModePlacementScope(device_tag, machine_device_ids, hierarchy=None):
+    if isinstance(machine_device_ids, tuple):
+        machine_device_ids = list(machine_device_ids)
+    if not isinstance(machine_device_ids, list):
+        machine_device_ids = [machine_device_ids]
+    sess = session_ctx.GetDefaultSession()
+    if hierarchy is not None:
+        hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
+    scope = scope_util.MakeScope(
+        lambda old_scope, builder: builder.BuildScopeWithNewParallelDesc(
+            old_scope, device_tag, machine_device_ids, hierarchy
+        )
+    )
+    return scope_util.ScopeContext(scope)
+
+
+@enable_if.condition(hob.in_global_mode)
+def GetGlobalModePlacementScope(device_tag, machine_device_ids, hierarchy=None):
+    if isinstance(machine_device_ids, (list, tuple)) == False:
+        machine_device_ids = [machine_device_ids]
+    sess = session_ctx.GetDefaultSession()
+    if hierarchy is not None:
+        hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
+
+    def BuildScope(old_scope, builder):
+        return builder.BuildScopeWithNewParallelDesc(
+            old_scope, device_tag, machine_device_ids, hierarchy
+        )
+
+    scope_ctx = scope_util.ScopeContext(scope_util.MakeScope(BuildScope))
+    return placement_ctx.GlobalModePlacementScope(scope_ctx)
+
+
+def GetDefaultMachineDeviceIds(resource):
+    if resource.HasField("gpu_device_num") and resource.gpu_device_num > 0:
+        return ("gpu", placement_ctx.GetGpuMachineDeviceIds(resource))
+    elif resource.HasField("cpu_device_num"):
+        return ("cpu", placement_ctx.GetCpuMachineDeviceIds(resource))
+    else:
+        raise NotImplementedError
diff --git a/python/oneflow/compatible/single_client/framework/profiler.py b/python/oneflow/compatible/single_client/framework/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..96b7268edebcbbcc3cba3e4c30e56cbf45c5f453
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/profiler.py
@@ -0,0 +1,24 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+
+
+def RangePush(range_name):
+    oneflow._oneflow_internal.profiler.RangePush(range_name)
+
+
+def RangePop():
+    oneflow._oneflow_internal.profiler.RangePop()
diff --git a/python/oneflow/compatible/single_client/framework/pull_util.py b/python/oneflow/compatible/single_client/framework/pull_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c692179d34b6e5f766a300833f017125d3726ef
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/pull_util.py
@@ -0,0 +1,281 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import threading
+
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.framework import local_blob as local_blob_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+class FutureRemoteBlobs(object):
+    def __init__(self):
+        self.inited_ = False
+
+    def get(self):
+        raise NotImplementedError
+
+    def async_get(self, callback):
+        raise NotImplementedError
+
+    def SetResult(self, remote_blobs):
+        raise NotImplementedError
+
+    def Inited(self):
+        assert self.inited_ is False
+        self.inited_ = True
+        return self
+
+
+class LazyFutureRemoteBlobs(FutureRemoteBlobs):
+    def __init__(self, session):
+        super().__init__()
+        self.session_ = session
+        self.cond_var_ = threading.Condition()
+        self.out_remote_blob_pullers_ = []
+        self.finished_cnt_ = 0
+        self.data_delivered_ = False
+        self.async_get_callback_ = lambda: None
+
+    def get(self):
+        assert self.inited_
+        assert self.data_delivered_ == False
+        self._Wait()
+        self.data_delivered_ = True
+        return self._TrySyncAndGetResultNdarray(self.out_remote_blob_pullers_)
+
+    def async_get(self, callback):
+        assert self.inited_
+        assert self.data_delivered_ == False
+        pullers_cnt = self._GetPullersCnt()
+
+        def Callback():
+            assert self.finished_cnt_ <= pullers_cnt
+            if self.finished_cnt_ == pullers_cnt:
+                callback(
+                    self._TrySyncAndGetResultNdarray(self.out_remote_blob_pullers_)
+                )
+
+        try:
+            self.cond_var_.acquire()
+            if self.finished_cnt_ == pullers_cnt:
+                Callback()
+            else:
+                self.async_get_callback_ = Callback
+        finally:
+            self.cond_var_.release()
+        self.data_delivered_ = True
+
+    def SetResult(self, out_remote_blobs):
+        assert self.inited_ == False
+        assert isinstance(self.out_remote_blob_pullers_, list)
+        assert len(self.out_remote_blob_pullers_) == 0
+        pullers = self._MakeRemoteBlobPullers(out_remote_blobs)
+        self.out_remote_blob_pullers_ = pullers
+        for puller in self._FlatConsistentBlobPullers(pullers):
+            puller.AsyncPull(self._FinishCallback)
+        return self
+
+    def _FinishCallback(self):
+        self.cond_var_.acquire()
+        self.finished_cnt_ += 1
+        self.cond_var_.notify()
+        self.async_get_callback_()
+        self.cond_var_.release()
+
+    def _Wait(self):
+        pullers_cnt = self._GetPullersCnt()
+        self.cond_var_.acquire()
+        while self.finished_cnt_ != pullers_cnt:
+            self.cond_var_.wait()
+        self.cond_var_.release()
+
+    def _TrySyncAndGetResultNdarray(self, pullers):
+        if self.session_.HasAnyCallbackAfterFunctionReturn():
+            self.session_.Sync()
+        return self._GetResultLocalBlob(pullers)
+
+    def _GetResultLocalBlob(self, pullers):
+        assert self.inited_
+        if isinstance(pullers, _BlobPuller):
+            return pullers.result
+        if isinstance(pullers, (list, tuple)):
+            return type(pullers)((self._GetResultLocalBlob(x) for x in pullers))
+        if isinstance(pullers, dict):
+            return {k: self._GetResultLocalBlob(v) for (k, v) in pullers.items()}
+        raise NotImplementedError
+
+    def _GetPullersCnt(self):
+        cnt = 0
+        for _ in self._FlatConsistentBlobPullers(self.out_remote_blob_pullers_):
+            cnt += 1
+        return cnt
+
+    def _FlatConsistentBlobPullers(self, pullers):
+        if isinstance(pullers, _BlobPuller):
+            for x in pullers.FlatConsistentBlobPullers():
+                yield x
+        elif isinstance(pullers, list) or isinstance(pullers, tuple):
+            for elem in pullers:
+                for x in self._FlatConsistentBlobPullers(elem):
+                    yield x
+        elif isinstance(pullers, dict):
+            for (_, v) in pullers.items():
+                for x in self._FlatConsistentBlobPullers(v):
+                    yield x
+        else:
+            raise NotImplementedError
+
+    def _MakeRemoteBlobPullers(self, out_remote_blobs):
+        if isinstance(out_remote_blobs, oneflow._oneflow_internal.ConsistentBlob):
+            return _ConsistentBlobPuller(out_remote_blobs, self.session_)
+        if isinstance(out_remote_blobs, oneflow._oneflow_internal.MirroredBlob):
+            return _MirroredBlobPuller(out_remote_blobs, self.session_)
+        if isinstance(out_remote_blobs, list) or isinstance(out_remote_blobs, tuple):
+            return type(out_remote_blobs)(
+                (self._MakeRemoteBlobPullers(x) for x in out_remote_blobs)
+            )
+        if isinstance(out_remote_blobs, dict):
+            return {
+                k: self._MakeRemoteBlobPullers(v) for (k, v) in out_remote_blobs.items()
+            }
+        raise NotImplementedError
+
+
+class _BlobPuller(object):
+    def __init__(self, session):
+        self.session_ = session
+
+    def FlatConsistentBlobPullers(self):
+        raise NotImplementedError
+
+    @property
+    def result(self):
+        raise NotImplementedError
+
+
+class _ConsistentBlobPuller(_BlobPuller):
+    def __init__(self, consistent_blob, session):
+        _BlobPuller.__init__(self, session)
+        self.result_ = None
+        self.consistent_blob_ = consistent_blob
+
+    @property
+    def result(self):
+        assert self.result_ is not None
+        return self.result_
+
+    def FlatConsistentBlobPullers(self):
+        yield self
+
+    def AsyncPull(self, pull_cb):
+        def PullCallback(of_blob):
+            self.result_ = local_blob_util.LocalBlob(
+                of_blob.CopyToNdarray(), self.consistent_blob_.is_dynamic
+            )
+            pull_cb()
+
+        self.session_.AsyncPull(self.consistent_blob_.op_name, PullCallback)
+
+
+class _MirroredBlobPuller(_BlobPuller):
+    def __init__(self, mirrored_blob, session):
+        _BlobPuller.__init__(self, session)
+        self.mirrored_blob_ = mirrored_blob
+        self.sub_pullers_ = tuple(
+            (
+                _ConsistentBlobPuller(x, self.session_)
+                for x in mirrored_blob.sub_consistent_blob_list
+            )
+        )
+        self.local_mirrored_blob_ = None
+
+    @property
+    def result(self):
+        if self.local_mirrored_blob_ is not None:
+            return self.local_mirrored_blob_
+        local_blob_list = [x.result.numpy() for x in self.sub_pullers_]
+        local_numpy = local_blob_list[0]
+        if len(local_blob_list) > 1:
+            print("WARNING: return tensor list will concat as axis = 0.")
+            local_numpy = np.concatenate(local_blob_list, axis=0)
+        self.local_mirrored_blob_ = local_blob_util.LocalBlob(
+            local_numpy, self.mirrored_blob_.is_dynamic
+        )
+        return self.local_mirrored_blob_
+
+    def FlatConsistentBlobPullers(self):
+        for x in self.sub_pullers_:
+            yield x
+
+
+class EagerFutureRemoteBlobs(FutureRemoteBlobs):
+    def __init__(self):
+        super().__init__()
+        self.blob_getters_ = None
+
+    def get(self):
+        return self._GetResultLocalBlob(self.blob_getters_)
+
+    def async_get(self, callback):
+        assert callable(callback)
+        callback(self._GetResultLocalBlob(self.blob_getters_))
+
+    def SetResult(self, remote_blobs):
+        assert self.inited_ is False
+        assert self.blob_getters_ is None
+        self.blob_getters_ = self._MakeRemoteBlobGetters(remote_blobs)
+        return self
+
+    def _MakeRemoteBlobGetters(self, remote_blobs):
+        if isinstance(remote_blobs, (list, tuple)):
+            return type(remote_blobs)(
+                (self._MakeRemoteBlobGetters(blob) for blob in remote_blobs)
+            )
+        elif isinstance(remote_blobs, dict):
+            return {
+                k: self._MakeRemoteBlobGetters(v) for (k, v) in remote_blobs.items()
+            }
+        elif isinstance(remote_blobs, oneflow._oneflow_internal.EagerBlobTrait):
+            return _EagerBlobGetter(remote_blobs)
+        else:
+            raise NotImplementedError
+
+    def _GetResultLocalBlob(self, getter):
+        assert self.inited_
+        if isinstance(getter, _EagerBlobGetter):
+            return getter.result
+        elif isinstance(getter, (list, tuple)):
+            return type(getter)((self._GetResultLocalBlob(g) for g in getter))
+        elif isinstance(getter, dict):
+            return {k: self._GetResultLocalBlob(v) for (k, v) in getter.items()}
+        else:
+            raise NotImplementedError(type(getter))
+
+
+class _EagerBlobGetter(object):
+    def __init__(self, eager_blob):
+        assert isinstance(eager_blob, oneflow._oneflow_internal.EagerBlobTrait)
+        self.eager_blob_ = eager_blob
+        self.local_tensor_ = None
+
+    @property
+    def result(self):
+        if self.local_tensor_ is not None:
+            return self.local_tensor_
+        self.local_tensor_ = local_blob_util.MakeLocalBlob4EagerBlob(self.eager_blob_)
+        return self.local_tensor_
diff --git a/python/oneflow/compatible/single_client/framework/push_util.py b/python/oneflow/compatible/single_client/framework/push_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..45a19d1b5a80cb66819ac2f16e33a1efb68782e9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/push_util.py
@@ -0,0 +1,288 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from functools import reduce
+
+import numpy
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import boxing_util as boxing_util
+from oneflow.compatible.single_client.framework import (
+    balanced_splitter as balanced_splitter,
+)
+from oneflow.compatible.single_client.framework import dtype as dtype_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_def
+from oneflow.compatible.single_client.framework import (
+    python_callback as python_callback,
+)
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def AsyncPush(session, job_func, *arg):
+    assert len(arg) == len(job_func.__oneflow_input_blob_defs__)
+    for i in range(len(arg)):
+        _AsyncPushArg(session, job_func.__oneflow_input_blob_defs__[i], arg[i])
+
+
+def _AsyncPushArg(session, arg_blob_def, arg_ndarray):
+    if isinstance(arg_blob_def, (list, tuple)):
+        assert isinstance(arg_ndarray, (list, tuple)), "type(arg_ndarray): %s" % type(
+            arg_ndarray
+        )
+        assert len(arg_blob_def) == len(arg_ndarray), "%s v.s. %s" % (
+            len(arg_blob_def),
+            len(arg_ndarray),
+        )
+        for (blob_def, ndarray) in zip(arg_blob_def, arg_ndarray):
+            _AsyncPushArg(session, blob_def, ndarray)
+    elif isinstance(arg_blob_def, dict):
+        assert type(arg_blob_def) is type(arg_ndarray)
+        assert set(arg_blob_def.keys()) == set(arg_ndarray.keys())
+        for (k, blob_def) in arg_blob_def.items():
+            _AsyncPushArg(session, blob_def, arg_ndarray[k])
+    else:
+        assert isinstance(arg_blob_def, input_blob_def.ArgBlobDef)
+        arg_blob_def.CheckAndAsyncPush(session, arg_ndarray)
+
+
+def MakeEagerInputBlobs(arg_blob_def, arg_ndarray):
+    if isinstance(arg_blob_def, (list, tuple)):
+        assert isinstance(arg_ndarray, (list, tuple)), "type(arg_ndarray): %s" % type(
+            arg_ndarray
+        )
+        assert len(arg_blob_def) == len(arg_ndarray)
+        return type(arg_blob_def)(
+            (
+                MakeEagerInputBlobs(blob_def, ndarray)
+                for (blob_def, ndarray) in zip(arg_blob_def, arg_ndarray)
+            )
+        )
+    elif isinstance(arg_blob_def, dict):
+        assert type(arg_blob_def) is type(arg_ndarray)
+        assert set(arg_blob_def.keys()) == set(arg_ndarray.keys())
+        return {
+            k: MakeEagerInputBlobs(blob_def, arg_ndarray[k])
+            for (k, blob_def) in arg_blob_def.items()
+        }
+    else:
+        return _CreateEagerInputBlobAndFeedValue(arg_blob_def, arg_ndarray)
+
+
+def _CheckInputArgBlobDefValueMatch(arg_blob_def, arg_value):
+    if isinstance(arg_blob_def, input_blob_def.FixedTensorDef):
+        assert isinstance(arg_value, numpy.ndarray)
+        assert arg_blob_def.shape == arg_value.shape
+    elif isinstance(arg_blob_def, input_blob_def.MirroredTensorDef):
+        assert isinstance(arg_value, (list, tuple))
+        for v in arg_value:
+            assert isinstance(v, numpy.ndarray)
+            assert len(v.shape) == len(arg_blob_def.shape)
+            assert numpy.prod(v.shape) <= numpy.prod(arg_blob_def.shape)
+    else:
+        raise NotImplementedError
+
+
+def FeedValueToEagerBlob(blob_object, blob_def, ndarray):
+    physical_blob_objects = _GetPhysicalBlobObjects(blob_object, None)
+    feed_ctx = FeedContext(blob_object.op_arg_parallel_attr, ndarray)
+    for (i, physical_blob_object) in enumerate(physical_blob_objects):
+        feed_ctx.set_rank(i)
+        _FeedValueToInputPhysicalBlob(feed_ctx, blob_def, physical_blob_object)
+
+
+def _CreateEagerInputBlobAndFeedValue(arg_blob_def, arg_ndarray):
+    _CheckInputArgBlobDefValueMatch(arg_blob_def, arg_ndarray)
+    (arg_blob_object, lbi) = _MakeInputBlobObject(arg_blob_def)
+    FeedValueToEagerBlob(arg_blob_object, arg_blob_def, arg_ndarray)
+    get_blob = None
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    if isinstance(arg_blob_def, input_blob_def.FixedTensorDef):
+
+        def get_blob(lbi, blob_object, blob_register):
+            blob = oneflow._oneflow_internal.EagerConsistentBlob(
+                lbi, blob_object, blob_register
+            )
+            with flow.scope.consistent_view():
+                return flow.identity(blob)
+
+    elif isinstance(arg_blob_def, input_blob_def.MirroredTensorDef):
+        get_blob = oneflow._oneflow_internal.EagerMirroredBlob
+    else:
+        raise NotImplementedError
+    return get_blob(lbi, blob_object=arg_blob_object, blob_register=blob_register)
+
+
+def _MakeInputBlobObject(arg_blob_def):
+    (input_op_conf, lbi) = _MakeInputOpConfAndRetLbi(arg_blob_def)
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildInputInstruction(builder):
+        op_attribute = arg_blob_def.EagerAddAndInferOp(input_op_conf)
+        scope = flow.current_scope()
+        parallel_conf = scope.device_parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInputInstruction)
+    return (bn_in_op2blob_object["out"], lbi)
+
+
+def _GetPhysicalBlobObjects(logical_blob_object, lbi):
+    blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+    physical_blob_objects = None
+
+    def BuildLogical2PhysicalInstruction(builder):
+        nonlocal physical_blob_objects
+        physical_blob_objects = builder.UnpackLogicalBlobToPhysicalBlobs(
+            logical_blob_object
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildLogical2PhysicalInstruction)
+    return physical_blob_objects
+
+
+def _MakeInputOpConfAndRetLbi(arg_blob_def):
+    assert isinstance(arg_blob_def, input_blob_def.ArgBlobDef)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = id_util.UniqueStr("Input_")
+    op_conf.input_conf.out = "out"
+    op_conf.input_conf.blob_conf.CopyFrom(arg_blob_def.ToInterfaceBlobConf())
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.input_conf.out
+    return (op_conf, lbi)
+
+
+class FeedContext(object):
+    def __init__(self, op_arg_parallel_attr, arg_ndarray, rank=0):
+        self.op_arg_parallel_attr_ = op_arg_parallel_attr
+        self.arg_ndarray_ = arg_ndarray
+        self.rank_ = rank
+        self.balanced_range_ = None
+
+    def set_rank(self, rank):
+        self.rank_ = rank
+
+    def GetFixedTensor(self, logical_shape):
+        assert isinstance(self.arg_ndarray_, numpy.ndarray)
+        assert self.arg_ndarray_.shape == logical_shape, "%s v.s. %s" % (
+            self.arg_ndarray_.shape,
+            logical_shape,
+        )
+        sbp_parallel = self.op_arg_parallel_attr_.sbp_parallel
+        parallel_num = self.op_arg_parallel_attr_.parallel_desc_symbol.parallel_num
+        if sbp_parallel.has_broadcast_parallel() or parallel_num == 1:
+            return self._AsContiguousNdArray(self.arg_ndarray_)
+        elif sbp_parallel.has_split_parallel():
+            axis = sbp_parallel.split_parallel().axis()
+            (start, end) = self._GetBalancedRanges(logical_shape[axis])[self.rank_]
+            slc = [slice(None)] * len(logical_shape)
+            slc[axis] = slice(start, end)
+            ndarray = self.arg_ndarray_[tuple(slc)]
+            return self._AsContiguousNdArray(ndarray)
+        else:
+            raise NotImplementedError
+
+    def _GetBalancedRanges(self, dim):
+        parallel_num = self.op_arg_parallel_attr_.parallel_desc_symbol.parallel_num
+        if self.balanced_range_ is None:
+            self.balanced_range_ = balanced_splitter.BalancedRanges(dim, parallel_num)
+        return self.balanced_range_
+
+    def GetMirroredTensor(self, static_shape):
+        capacity = reduce(lambda x, y: x * y, static_shape, 1)
+        assert isinstance(self.arg_ndarray_, (list, tuple))
+        parallel_num = self.op_arg_parallel_attr_.parallel_desc_symbol.parallel_num
+        assert len(self.arg_ndarray_) == parallel_num
+        assert all((isinstance(a, numpy.ndarray) for a in self.arg_ndarray_))
+        assert self.rank_ >= 0
+        assert self.rank_ < parallel_num
+        ndarray = self.arg_ndarray_[self.rank_]
+        elem_cnt = reduce(lambda x, y: x * y, ndarray.shape, 1)
+        assert elem_cnt <= capacity, "%s v.s. %s" % (ndarray.shape, static_shape)
+        return self._AsContiguousNdArray(ndarray)
+
+    def _AsContiguousNdArray(self, ndarray):
+        if isinstance(ndarray, numpy.ndarray):
+            return (
+                ndarray
+                if ndarray.flags["C_CONTIGUOUS"]
+                else numpy.ascontiguousarray(ndarray)
+            )
+        elif isinstance(ndarray, (tuple, list)):
+            return type(ndarray)((self._AsContiguousNdArray(a) for a in ndarray))
+        else:
+            raise NotImplementedError
+
+
+def _FeedValueToInputPhysicalBlob(feed_ctx, blob_def, blob_object):
+    assert isinstance(blob_def, input_blob_def.ArgBlobDef)
+    assert isinstance(blob_object, oneflow._oneflow_internal.BlobObject)
+    FeedBlob = _MakeFeedBlobCallback(feed_ctx, blob_def, blob_object)
+    assert callable(FeedBlob)
+
+    def BuildFeedInstruction(builder):
+        builder.FeedBlob(
+            blob_object, python_callback.GetIdForRegisteredCallback(FeedBlob)
+        )
+        builder.InsertRemoveForeignCallbackInstruction(
+            blob_object.object_id, python_callback.GetIdForRegisteredCallback(FeedBlob)
+        )
+
+    oneflow._oneflow_internal.deprecated.PhysicalRun(BuildFeedInstruction)
+
+
+def _MakeFeedBlobCallback(feed_ctx, blob_def, blob_object):
+    if isinstance(blob_def, input_blob_def.FixedTensorDef):
+
+        def FeedBlob(ofblob):
+            ndarray = feed_ctx.GetFixedTensor(blob_def.shape)
+            dtype = dtype_util.convert_oneflow_dtype_to_numpy_dtype(ofblob.dtype)
+            assert ndarray.dtype == dtype, "%s v.s. %s" % (ndarray.dtype, dtype)
+            assert ndarray.shape == ofblob.static_shape, "%s v.s. %s" % (
+                ndarray.shape,
+                ofblob.static_shape,
+            )
+            if ofblob.CopyFromNdarray(ndarray) is False:
+                raise ValueError
+
+    elif isinstance(blob_def, input_blob_def.MirroredTensorDef):
+
+        def FeedBlob(ofblob):
+            ndarray = feed_ctx.GetMirroredTensor(ofblob.static_shape)
+            assert isinstance(ndarray, numpy.ndarray)
+            dtype = dtype_util.convert_oneflow_dtype_to_numpy_dtype(ofblob.dtype)
+            assert ndarray.dtype == dtype, "%s v.s. %s" % (ndarray.dtype, dtype)
+            if ofblob.CopyFromNdarray(ndarray) is False:
+                raise ValueError
+
+    else:
+        raise NotImplementedError
+    return FeedBlob
diff --git a/python/oneflow/compatible/single_client/framework/python_callback.py b/python/oneflow/compatible/single_client/framework/python_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..c797d01626af0aacabf3b5259b96490b360ab21d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/python_callback.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import job_conf as job_conf_cfg
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow._oneflow_internal.oneflow.core.job import scope as scope_cfg
+from oneflow._oneflow_internal.oneflow.core.operator import (
+    op_attribute as op_attribute_cfg,
+)
+from oneflow.compatible.single_client.framework import ofblob as ofblob
+
+
+def GetIdForRegisteredCallback(cb):
+    assert callable(cb)
+    global unique_id2handler
+    unique_id2handler[id(cb)] = cb
+    return id(cb)
+
+
+def DeleteRegisteredCallback(cb):
+    global unique_id2handler
+    assert id(cb) in unique_id2handler
+    del unique_id2handler[id(cb)]
+
+
+class PythonCallback(oneflow._oneflow_internal.ForeignCallback):
+    def __init__(self):
+        oneflow._oneflow_internal.ForeignCallback.__init__(self)
+
+    def OfBlobCall(self, unique_id, of_blob_ptr):
+        try:
+            _WatcherHandler(unique_id, of_blob_ptr)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def RemoveForeignCallback(self, unique_id):
+        global unique_id2handler
+        try:
+            del unique_id2handler[unique_id]
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def EagerInterpretCompletedOp(self, op_attribute, parallel_conf):
+        try:
+            interpreter_callback.InterpretCompletedOp(str(op_attribute), parallel_conf)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def EagerMirroredCast(self, op_attribute, parallel_conf):
+        try:
+            interpreter_callback.MirroredCast(str(op_attribute), parallel_conf)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def MakeScopeSymbol(self, job_conf, parallel_conf, is_mirrored):
+        try:
+            return interpreter_callback.MakeScopeSymbol(
+                job_conf, parallel_conf, is_mirrored
+            )
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def MakeParallelDescSymbol(self, parallel_conf):
+        try:
+            return interpreter_callback.MakeParallelDescSymbol(parallel_conf)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+
+def _WatcherHandler(unique_id, of_blob_ptr):
+    global unique_id2handler
+    assert unique_id in unique_id2handler
+    handler = unique_id2handler[unique_id]
+    assert callable(handler)
+    handler(ofblob.OfBlob(of_blob_ptr))
+
+
+unique_id2handler = {}
+global_python_callback = PythonCallback()
+interpreter_callback = None
diff --git a/python/oneflow/compatible/single_client/framework/register_class_method_util.py b/python/oneflow/compatible/single_client/framework/register_class_method_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7ab35fc34048d9231adb8d773b197c02f4970fa
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/register_class_method_util.py
@@ -0,0 +1,37 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.eager import eager_blob_util as eager_blob_util
+from oneflow.compatible.single_client.framework import blob_trait as blob_trait
+from oneflow.compatible.single_client.framework import functional as functional
+from oneflow.compatible.single_client.framework import generator as generator
+from oneflow.compatible.single_client.framework import op_expr_util as op_expr_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def RegisterMethod4Class():
+    op_expr_util.RegisterMethod4UserOpExpr()
+    functional.RegisterFunctionalApis()
+    eager_blob_util.RegisterMethod4EagerPhysicalBlob()
+    blob_trait.RegisterBlobOperatorTraitMethod(
+        oneflow._oneflow_internal.EagerPhysicalBlob
+    )
+    blob_trait.RegisterBlobOperatorTraitMethod(oneflow._oneflow_internal.ConsistentBlob)
+    blob_trait.RegisterBlobOperatorTraitMethod(oneflow._oneflow_internal.MirroredBlob)
+    remote_blob_util.RegisterMethod4EagerBlobTrait()
+    remote_blob_util.RegisterMethod4LazyConsistentBlob()
+    remote_blob_util.RegisterMethod4LazyMirroredBlob()
+    remote_blob_util.RegisterMethod4EagerConsistentBlob()
diff --git a/python/oneflow/compatible/single_client/framework/register_python_callback.py b/python/oneflow/compatible/single_client/framework/register_python_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..93168a1ee9e5ea9784cb66e287997b7ba95407f4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/register_python_callback.py
@@ -0,0 +1,24 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.eager import (
+    interpreter_callback as interpreter_callback,
+)
+from oneflow.compatible.single_client.framework import (
+    python_callback as python_callback,
+)
+
+python_callback.interpreter_callback = interpreter_callback
diff --git a/python/oneflow/compatible/single_client/framework/remote_blob.py b/python/oneflow/compatible/single_client/framework/remote_blob.py
new file mode 100644
index 0000000000000000000000000000000000000000..d46637bb308480dc271c136734e6341918f4bf61
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/remote_blob.py
@@ -0,0 +1,240 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import sys
+import traceback
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import boxing_util as boxing_util
+from oneflow.compatible.single_client.eager import eager_blob_util as eager_blob_util
+from oneflow.compatible.single_client.eager import gradient_util as gradient_util
+from oneflow.compatible.single_client.framework import blob_trait as blob_trait
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_ctx,
+)
+from oneflow.compatible.single_client.framework.dtype import (
+    convert_proto_dtype_to_oneflow_dtype,
+)
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def RemoteBlob(lbi, **kw):
+    api = enable_if.unique([EagerLogicalBlob, LazyRemoteBlob])
+    return api(lbi, **kw)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerLogicalBlob(lbi, **kw):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    lbn = lbi.op_name + "/" + lbi.blob_name
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    blob_type = oneflow._oneflow_internal.EagerConsistentBlob
+    if c_api_util.JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn):
+        blob_type = oneflow._oneflow_internal.EagerMirroredBlob
+    job_name = ""
+    if "job_name" in kw and kw["job_name"] is not None:
+        job_name = kw["job_name"]
+    blob_object = None
+    if "blob_object" in kw:
+        blob_object = kw["blob_object"]
+    distribute = oneflow._oneflow_internal.distribute.auto()
+    if "distribute" in kw:
+        distribute = kw["distribute"]
+    return blob_type(lbi, blob_object, blob_register, job_name, distribute)
+
+
+@enable_if.condition(~hob.eager_execution_enabled)
+def LazyRemoteBlob(lbi, **kw):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    lbn = lbi.op_name + "/" + lbi.blob_name
+    blob_type = oneflow._oneflow_internal.LazyConsistentBlob
+    if c_api_util.JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn):
+        blob_type = oneflow._oneflow_internal.LazyMirroredBlob
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    job_name = ""
+    if "job_name" in kw and kw["job_name"] is not None:
+        job_name = kw["job_name"]
+    distribute = oneflow._oneflow_internal.distribute.auto()
+    if "distribute" in kw:
+        distribute = kw["distribute"]
+    return blob_type(lbi, job_name, distribute)
+
+
+@property
+def dtype(self):
+    ret = convert_proto_dtype_to_oneflow_dtype(self.get_dtype())
+    assert isinstance(ret, flow.dtype)
+    return ret
+
+
+def with_distribute(self, distribute):
+    new = type(self)(
+        self.lbi, self.job_name, oneflow._oneflow_internal.distribute.auto()
+    )
+    new.set_distribute(distribute)
+    return new
+
+
+def with_gradient_distribute(self, distribute):
+    return flow.parallel_cast(self, gradient_distribute=distribute)
+
+
+def get_lazy_shape_log_warning(self):
+    if flow.scope.mirrored_view_enabled():
+        return "%s\n%s\n%s" % (
+            "WARNING:",
+            "You access a consistent blob shape in mirrored view, there may be problems,",
+            "you should add 'x = flow.cast_to_current_logical_view(x)'.",
+        )
+    else:
+        return ""
+
+
+def get_mirror_shape_log_warning(self):
+    if flow.scope.consistent_view_enabled():
+        return "%s\n%s\n%s" % (
+            "WARNING:",
+            "You access a mirrored blob shape in consistent view, there may be problems,",
+            "you should add 'x = flow.cast_to_current_logical_view(x)'.",
+        )
+    else:
+        return ""
+
+
+def RegisterMethod4BlobDef(blob_class):
+    blob_class.dtype = dtype
+    blob_class.with_distribute = with_distribute
+    blob_class.with_gradient_distribute = with_gradient_distribute
+
+
+def RegisterMethod4LazyConsistentBlob():
+    RegisterMethod4BlobDef(oneflow._oneflow_internal.LazyConsistentBlob)
+    oneflow._oneflow_internal.LazyConsistentBlob.get_lazy_shape_log_warning = (
+        get_lazy_shape_log_warning
+    )
+
+
+def RegisterMethod4LazyMirroredBlob():
+    RegisterMethod4BlobDef(oneflow._oneflow_internal.LazyMirroredBlob)
+    oneflow._oneflow_internal.LazyMirroredBlob.get_mirror_shape_log_warning = (
+        get_mirror_shape_log_warning
+    )
+
+
+@property
+def sub_consistent_blob_list(self):
+    raise NotImplementedError
+
+
+def numpy(self, rank=None):
+    assert rank is None or rank == 0
+    return self._Numpy()
+
+
+def numpy_list(self, rank=None):
+    assert rank is None or rank == 0
+    return [self._Numpy()]
+
+
+def BlobObjectNumpy(blob_object, tmp_name=None):
+    if tmp_name is None:
+        tmp_name = id_util.UniqueStr("numpy-tmp-")
+
+    def FetchBlobNumpy(blob_object):
+        consistent_blob_name = None
+
+        def BoxingToSingleDevice(builder):
+            parallel_conf = placement_cfg.ParallelConf()
+            parallel_conf.set_device_tag(blob_object.parallel_desc_symbol.device_tag)
+            parallel_conf.add_device_name("{}:{}".format(0, 0))
+            tmp_parallel_desc_symbol = builder.GetParallelDescSymbol(parallel_conf)
+            tmp_op_arg_parallel_attr = oneflow._oneflow_internal.OpArgParallelAttribute(
+                tmp_parallel_desc_symbol,
+                str(blob_object.op_arg_parallel_attr.sbp_parallel),
+                str(blob_object.op_arg_parallel_attr.opt_mirrored_parallel),
+            )
+            with flow.scope.placement(
+                parallel_conf.device_tag(), list(parallel_conf.device_name())
+            ):
+                tmp_blob_object = boxing_util.BoxingTo(
+                    builder, blob_object, tmp_op_arg_parallel_attr
+                )
+            nonlocal consistent_blob_name
+            consistent_blob_name = tmp_name
+            if not blob_register.HasObject4BlobName(consistent_blob_name):
+                blob_register.SetObject4BlobName(consistent_blob_name, tmp_blob_object)
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(BoxingToSingleDevice)
+        return oneflow._oneflow_internal.EagerPhysicalBlob(
+            consistent_blob_name,
+            blob_register,
+            eager_blob_util._GetPhysicalBlobHeaderCache,
+        ).numpy()
+
+    return FetchBlobNumpy(blob_object)
+
+
+def _Numpy(self):
+    tmp_name = "{}-consistent".format(self.logical_blob_name)
+    return BlobObjectNumpy(self.blob_object, tmp_name)
+
+
+def RegisterMethod4EagerBlobTrait():
+    oneflow._oneflow_internal.EagerBlobTrait.sub_consistent_blob_list = (
+        sub_consistent_blob_list
+    )
+    oneflow._oneflow_internal.EagerBlobTrait.dtype = dtype
+    oneflow._oneflow_internal.EagerBlobTrait._Numpy = _Numpy
+    oneflow._oneflow_internal.EagerBlobTrait.numpy = numpy
+    oneflow._oneflow_internal.EagerBlobTrait.numpy_list = numpy_list
+
+
+def eager_with_distribute(self, distribute):
+    new = type(self)(
+        self.lbi,
+        blob_object=self.blob_object,
+        blob_register=blob_register,
+        job_name=self.job_name,
+        distribute=self.distribute,
+    )
+    new.set_distribute(distribute)
+    return new
+
+
+def RegisterMethod4EagerConsistentBlob():
+    oneflow._oneflow_internal.EagerConsistentBlob.dtype = dtype
+    oneflow._oneflow_internal.EagerConsistentBlob.with_distribute = (
+        eager_with_distribute
+    )
+    oneflow._oneflow_internal.EagerConsistentBlob.with_gradient_distribute = (
+        with_gradient_distribute
+    )
diff --git a/python/oneflow/compatible/single_client/framework/runtime_mode.py b/python/oneflow/compatible/single_client/framework/runtime_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e063b035e2201dc89e4a9e7e03f19b8d267937a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/runtime_mode.py
@@ -0,0 +1,41 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+NORMAL_MODE = "NORMAL_MODE"
+GLOBAL_MODE = "GLOBAL_MODE"
+DEVICE_MODE = "DEVICE_MODE"
+
+
+def CurrentMode():
+    return mode_statck[0]
+
+
+def IsValidMode(mode):
+    return mode == NORMAL_MODE or mode == GLOBAL_MODE or mode == DEVICE_MODE
+
+
+@contextmanager
+def ModeScope(mode):
+    global mode_statck
+    mode_statck.insert(0, mode)
+    try:
+        yield
+    finally:
+        mode_statck.pop(0)
+
+
+mode_statck = [NORMAL_MODE]
diff --git a/python/oneflow/compatible/single_client/framework/scope_symbol.py b/python/oneflow/compatible/single_client/framework/scope_symbol.py
new file mode 100644
index 0000000000000000000000000000000000000000..59e88ea93445bba54026aa591c36e1924e34ac25
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/scope_symbol.py
@@ -0,0 +1,160 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import re
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow._oneflow_internal.oneflow.core.job import scope as scope_cfg
+from oneflow.compatible.single_client.eager import symbol_storage as symbol_storage
+from oneflow.compatible.single_client.eager.symbol import Symbol
+
+
+class ScopeSymbol(Symbol):
+    def __init__(self, symbol_id, scope_proto, parent_scope_symbol=None):
+        Symbol.__init__(self, symbol_id, scope_proto)
+        self.parent_scope_symbol_ = parent_scope_symbol
+        self.job_desc_symbol_ = oneflow._oneflow_internal.GetJobConfSymbol(
+            scope_proto.job_desc_symbol_id()
+        )
+        self.device_parallel_desc_symbol_ = oneflow._oneflow_internal.GetPlacementSymbol(
+            scope_proto.device_parallel_desc_symbol_id()
+        )
+        self.host_parallel_desc_symbol_ = oneflow._oneflow_internal.GetPlacementSymbol(
+            scope_proto.host_parallel_desc_symbol_id()
+        )
+        self.auto_increment_id_ = 0
+
+    def auto_increment_id(self):
+        self.auto_increment_id_ = self.auto_increment_id_ + 1
+        return self.auto_increment_id_
+
+    @property
+    def session_id(self):
+        return self.data.session_id()
+
+    @property
+    def job_desc_symbol(self):
+        return self.job_desc_symbol_
+
+    @property
+    def device_parallel_desc_symbol(self):
+        return self.device_parallel_desc_symbol_
+
+    @property
+    def parent_scope_symbol(self):
+        return self.parent_scope_symbol_
+
+    def BuildBySetter(self, instruction_builder, setter):
+        scope_proto = self._CloneScopeProto()
+        setter(scope_proto)
+        return instruction_builder.GetScopeSymbol(scope_proto)
+
+    def BuildWithNewParallelDesc(
+        self, instruction_builder, device_tag, machine_device_ids
+    ):
+        if isinstance(machine_device_ids, str):
+            machine_device_ids = [machine_device_ids]
+
+        def SetScopeProto(scope_proto):
+            parallel_conf = MakeParallelConf(device_tag, machine_device_ids)
+            device_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(
+                parallel_conf
+            )
+            parallel_conf = MakeParallelConf("cpu", machine_device_ids)
+            host_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(
+                parallel_conf
+            )
+            scope_proto.set_device_parallel_desc_symbol_id(
+                device_parallel_desc_sym.symbol_id
+            )
+            scope_proto.set_host_parallel_desc_symbol_id(
+                host_parallel_desc_sym.symbol_id
+            )
+
+        return self.BuildBySetter(instruction_builder, SetScopeProto)
+
+    def BuildWithNewParallelConf(self, instruction_builder, parallel_conf):
+        (
+            device_tag,
+            machine_device_ids,
+            hierarchy,
+        ) = oneflow._oneflow_internal.GetDeviceTagAndMachineDeviceIdsAndHierarchy(
+            parallel_conf
+        )
+        return self.BuildWithNewParallelDesc(
+            instruction_builder, device_tag, machine_device_ids
+        )
+
+    def BuildWithNewIsMirrored(self, instruction_builder, is_mirrored):
+        def SetScopeProto(scope_proto):
+            if is_mirrored:
+                scope_proto.mutable_opt_mirrored_parallel_conf().mutable_mirrored_parallel()
+            else:
+                scope_proto.mutable_opt_mirrored_parallel_conf().clear_mirrored_parallel()
+
+        return self.BuildBySetter(instruction_builder, SetScopeProto)
+
+    def BuildWithNewScopeName(self, instruction_builder, scope_name):
+        def SetScopeProto(scope_proto):
+            scope_proto.add_scope_op_name_prefixes(scope_name)
+
+        return self.BuildBySetter(instruction_builder, SetScopeProto)
+
+    def _CloneScopeProto(self):
+        scope_proto = scope_cfg.ScopeProto()
+        scope_proto.CopyFrom(self.data)
+        return scope_proto
+
+
+def BuildInitialScope(
+    instruction_builder,
+    session_id,
+    job_conf,
+    device_tag,
+    machine_device_ids,
+    is_mirrored,
+):
+    scope_proto = scope_cfg.ScopeProto()
+    scope_proto.set_session_id(session_id)
+    job_conf_sym = instruction_builder.GetJobConfSymbol(job_conf)
+    scope_proto.set_job_desc_symbol_id(job_conf_sym.symbol_id)
+    parallel_conf = MakeParallelConf(device_tag, machine_device_ids)
+    device_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(parallel_conf)
+    scope_proto.set_device_parallel_desc_symbol_id(device_parallel_desc_sym.symbol_id)
+    parallel_conf = MakeParallelConf("cpu", machine_device_ids)
+    host_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(parallel_conf)
+    scope_proto.set_host_parallel_desc_symbol_id(host_parallel_desc_sym.symbol_id)
+    if is_mirrored:
+        scope_proto.mutable_opt_mirrored_parallel_conf().mutable_mirrored_parallel()
+    else:
+        scope_proto.mutable_opt_mirrored_parallel_conf().clear_mirrored_parallel()
+    return instruction_builder.GetScopeSymbol(scope_proto)
+
+
+def MakeParallelConf(device_tag, machine_device_ids):
+    assert isinstance(machine_device_ids, (list, tuple))
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag(device_tag)
+    for machine_device_id in machine_device_ids:
+        assert isinstance(
+            machine_device_id, str
+        ), "type of machine_device_id (%s) is not string" % type(machine_device_id)
+        assert re.match("^\\d+:\\d+(-\\d+)?$", machine_device_id) is not None, (
+            "machine_device_id: %s is not valid" % machine_device_id
+        )
+        parallel_conf.add_device_name(machine_device_id)
+    return parallel_conf
diff --git a/python/oneflow/compatible/single_client/framework/scope_util.py b/python/oneflow/compatible/single_client/framework/scope_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c36936d770367c9024517bf9632375b611c633a1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/scope_util.py
@@ -0,0 +1,114 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow._oneflow_internal.oneflow.core.job import job_conf as job_conf_cfg
+from oneflow.compatible.single_client.framework import attr_util as attr_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+
+
+def api_scope_config(**kwargs):
+    name2default = session_ctx.GetDefaultSession().scope_attr_name2default_val
+
+    def SetScopeProto(scope_proto):
+        for (attr_name, py_value) in kwargs.items():
+            assert attr_name in name2default
+            attr_util.SetAttrValue(
+                scope_proto.mutable_attr_name2attr_value()[attr_name],
+                py_value,
+                name2default[attr_name],
+            )
+
+    sess = session_ctx.GetDefaultSession()
+    scope = MakeScope(
+        lambda old_scope, builder: builder.BuildScopeByProtoSetter(
+            old_scope, SetScopeProto
+        )
+    )
+    return ScopeContext(scope)
+
+
+def api_current_scope():
+    """ Return current scope
+    """
+    return oneflow._oneflow_internal.GetCurrentScope()
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_current_scope(*args, **kwargs):
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.scope.current_scope",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.compatible.single_client.current_scope"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return api_current_scope(*args, **kwargs)
+
+
+def MakeScope(build_func):
+    scope = None
+    old_scope = oneflow._oneflow_internal.GetCurrentScope()
+    assert old_scope is not None
+
+    def BuildScope(builder):
+        nonlocal scope
+        scope = build_func(old_scope, builder)
+        assert scope is not None
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildScope)
+    return scope
+
+
+def MakeInitialScope(job_conf, device_tag, machine_device_ids, hierarchy, is_mirrored):
+    scope = None
+
+    def BuildInitialScope(builder):
+        nonlocal scope
+        session_id = session_ctx.GetDefaultSession().id
+        scope = builder.BuildInitialScope(
+            session_id, job_conf, device_tag, machine_device_ids, hierarchy, is_mirrored
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInitialScope)
+    return scope
+
+
+def InitScopeStack():
+    job_conf = job_conf_cfg.JobConfigProto()
+    job_conf.mutable_predict_conf()
+    job_conf.set_job_name("")
+    scope = MakeInitialScope(job_conf, "cpu", ["0:0"], None, is_mirrored=False)
+    oneflow._oneflow_internal.InitGlobalScopeStack(scope)
+
+
+@contextmanager
+def ScopeContext(scope):
+    old_scope = oneflow._oneflow_internal.GetCurrentScope()
+    oneflow._oneflow_internal.GlobalScopeStackPush(scope)
+    try:
+        yield
+    finally:
+        assert oneflow._oneflow_internal.GetCurrentScope() is scope
+        oneflow._oneflow_internal.GlobalScopeStackPop()
+        assert oneflow._oneflow_internal.GetCurrentScope() is old_scope
diff --git a/python/oneflow/compatible/single_client/framework/session_context.py b/python/oneflow/compatible/single_client/framework/session_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..9150f56ff22a1bc9976b43ed8368d66d43c77635
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/session_context.py
@@ -0,0 +1,66 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+
+
+class SessionStatus:
+    OPEN = "OPEN"
+    RUNNING = "RUNNING"
+    CLOSED = "CLOSED"
+
+
+def GetDefaultSession():
+    global _sess_id2sess
+    default_sess_id = oneflow._oneflow_internal.GetDefaultSessionId()
+    assert default_sess_id in _sess_id2sess
+    return _sess_id2sess[default_sess_id]
+
+
+def OpenDefaultSession(sess):
+    global _sess_id2sess
+    assert sess.id not in _sess_id2sess
+    _sess_id2sess[sess.id] = sess
+
+
+def TryCloseDefaultSession():
+    global _sess_id2sess
+    default_sess_id = oneflow._oneflow_internal.GetDefaultSessionId()
+    assert default_sess_id in _sess_id2sess
+    if default_sess_id in _sess_id2sess:
+        _sess_id2sess[default_sess_id].TryClose()
+    del _sess_id2sess[default_sess_id]
+
+
+def TryCloseAllSession():
+    global _sess_id2sess
+    for sess_id in _sess_id2sess.keys():
+        _sess_id2sess[sess_id].TryClose()
+    _sess_id2sess.clear()
+
+
+def try_init_default_session(func):
+    @functools.wraps(func)
+    def Func(*args, **kwargs):
+        GetDefaultSession().TryInit()
+        return func(*args, **kwargs)
+
+    return Func
+
+
+_sess_id2sess = {}
diff --git a/python/oneflow/compatible/single_client/framework/session_util.py b/python/oneflow/compatible/single_client/framework/session_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..d80584fd889211af3506dd7ba0552a4dce0f0f60
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/session_util.py
@@ -0,0 +1,506 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import threading
+import traceback
+from contextlib import contextmanager
+from typing import Callable
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import op_executor as op_executor
+from oneflow.compatible.single_client.experimental import interface_op_read_and_write
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import check_point_v2 as check_point_v2
+from oneflow.compatible.single_client.framework import compiler as compiler
+from oneflow.compatible.single_client.framework import config_util as config_util
+from oneflow.compatible.single_client.framework import env_util as env_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import job_instance as job_instance_util
+from oneflow.compatible.single_client.framework import module as module_util
+from oneflow.compatible.single_client.framework import push_util as push_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.framework import typing_util as oft_util
+from oneflow.compatible.single_client.framework.check_point import SnapshotManager
+from oneflow.compatible.single_client.framework.function_desc import FunctionDesc
+from oneflow.compatible.single_client.framework.pull_util import (
+    EagerFutureRemoteBlobs,
+    LazyFutureRemoteBlobs,
+)
+from oneflow.compatible.single_client.framework.session_context import SessionStatus
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.job import job_set_pb2 as job_set_util
+from oneflow.core.job.job_set_pb2 import ConfigProto
+
+
+class Session(object):
+    def __init__(self, sess_id):
+        self.job_name2function_desc_ = {}
+        self.job_name2job_ = {}
+        self.status_ = SessionStatus.OPEN
+        self.cond_var_ = threading.Condition()
+        self.running_job_cnt_ = 0
+        self.inter_user_job_info_ = None
+        self.uuid2watch_handler_ = {}
+        self.config_proto_ = None
+        self.resource_ = None
+        self.job_name2var_name2var_blob_ = {}
+        self.job_name2module_name2module_ = {}
+        self.existed_module_names_ = set()
+        self.var_name2var_blob_ = {}
+        self.interface_op_name2op_attr_ = {}
+        self.interface_op_name2job_name_ = {}
+        self.lazy_interface_op_name2parallel_conf_ = {}
+        self.op_name2lazy_blob_cache_ = {}
+        self.job_name2name_scope_stack_ = {}
+        self.eager_global_function_desc_stack_ = []
+        self.function_flag_name2default_val_ = {}
+        self._UpdateFunctionFlagName2DefaultVal()
+        self.scope_attr_name2default_val_ = {}
+        self._UpdateScopeAttrName2DefaultVal()
+        self.sess_ = oneflow._oneflow_internal.RegsiterSession(sess_id)
+        self.backward_blob_register_ = oneflow._oneflow_internal.BlobRegister()
+        self.snapshot_mgr_ = SnapshotManager()
+        self.eager_config_proto_ctx_ = None
+
+    @property
+    def id(self):
+        return self.sess_.id
+
+    @property
+    def status(self):
+        return self.status_
+
+    @property
+    def is_running(self):
+        return self.status_ is SessionStatus.RUNNING
+
+    @property
+    def config_proto(self):
+        if self.config_proto_ is None:
+            self.config_proto_ = _GetDefaultConfigProto()
+        return self.config_proto_
+
+    @property
+    def resource(self):
+        if self.resource_ is None:
+            return flow.env.current_resource()
+        else:
+            return self.resource_
+
+    @property
+    def uuid2watch_handler(self):
+        return self.uuid2watch_handler_
+
+    @property
+    def function_flag_name2default_val(self):
+        return self.function_flag_name2default_val_
+
+    @property
+    def scope_attr_name2default_val(self):
+        return self.scope_attr_name2default_val_
+
+    @property
+    def inter_user_job_info(self):
+        return self.inter_user_job_info_
+
+    @property
+    def job_name2name_scope_stack(self):
+        return self.job_name2name_scope_stack_
+
+    @property
+    def backward_blob_register(self):
+        return self.backward_blob_register_
+
+    @property
+    def snapshot_mgr(self):
+        return self.snapshot_mgr_
+
+    @property
+    def var_name2var_blob(self):
+        return self.var_name2var_blob_
+
+    def GetLazyFunctionDesc(self, job_name):
+        if job_name in self.job_name2function_desc_:
+            return self.job_name2function_desc_[job_name]
+        return None
+
+    def AnyGlobalFunctionDefined(self):
+        return len(self.job_name2function_desc_) > 0
+
+    def GetJobConfigProto(self, job_name):
+        return self.job_name2function_desc_[job_name].job_config_proto
+
+    def GetFunctionDesc(self, job_name):
+        return self.job_name2function_desc_[job_name]
+
+    def _UpdateFunctionFlagName2DefaultVal(self):
+        items = c_api_util.GetFunctionConfigDef().attr_name2attr_def.items()
+        self.function_flag_name2default_val_ = {k: v.default_val for (k, v) in items}
+
+    def _UpdateScopeAttrName2DefaultVal(self):
+        items = c_api_util.GetScopeConfigDef().attr_name2attr_def.items()
+        self.scope_attr_name2default_val_ = {k: v.default_val for (k, v) in items}
+
+    def TryInit(self):
+        if self.status_ is SessionStatus.OPEN:
+            self.Init()
+        return self
+
+    def UpdateInfo4InterfaceOp(self):
+        for op_attr in c_api_util.GetInterfaceOpAttributes().op_attribute:
+            self.interface_op_name2op_attr_[op_attr.op_conf.name] = op_attr
+        for job in c_api_util.GetJobSet().job:
+            op_name2parallel_conf = {}
+            for placement_group in job.placement.placement_group:
+                for op_name in placement_group.op_set.op_name:
+                    op_name2parallel_conf[op_name] = placement_group.parallel_conf
+            for op_conf in job.net.op:
+                if c_api_util.IsInterfaceOpConf(op_conf):
+                    self.interface_op_name2job_name_[
+                        op_conf.name
+                    ] = job.job_conf.job_name
+                    self.lazy_interface_op_name2parallel_conf_[
+                        op_conf.name
+                    ] = op_name2parallel_conf[op_conf.name]
+
+    def Init(self):
+        assert self.status_ is SessionStatus.OPEN
+        self.status_ = SessionStatus.RUNNING
+        if not oneflow._oneflow_internal.IsEnvInited():
+            flow.env.init()
+        _TryCompleteConfigProto(self.config_proto)
+        self.resource_ = self.config_proto.resource
+        if not oneflow._oneflow_internal.EagerExecutionEnabled():
+            c_api_util.InitLazyGlobalSession(self.config_proto)
+            for (job_name, func_desc) in self.job_name2function_desc_.items():
+                compiler.Compile(self, func_desc, self.config_proto)
+                self.existed_module_names_ = set()
+            self.job_name2var_name2var_blob_ = dict()
+            assert len(self.job_name2function_desc_.items()) > 0
+            oneflow._oneflow_internal.StartLazyGlobalSession()
+            self.inter_user_job_info_ = c_api_util.GetInterUserJobInfo()
+            self.UpdateInfo4InterfaceOp()
+            if not config_util.api_legacy_model_io_enabled():
+                check_point_v2.Init()
+        else:
+            self.eager_config_proto_ctx_ = oneflow._oneflow_internal.LogicalConfigProtoContext(
+                str(self.config_proto)
+            )
+        return self
+
+    def FindOrCreateLazyBlob(self, op_name, Create):
+        if op_name not in self.op_name2lazy_blob_cache_:
+            self.op_name2lazy_blob_cache_[op_name] = Create()
+        return self.op_name2lazy_blob_cache_[op_name]
+
+    def TryClose(self):
+        if self.status_ is SessionStatus.RUNNING:
+            self.Close()
+        if self.status_ != SessionStatus.CLOSED:
+            oneflow._oneflow_internal.ClearSessionById(self.id)
+        self.status_ = SessionStatus.CLOSED
+
+    def Close(self):
+        assert self.status_ is SessionStatus.RUNNING
+        self.Sync()
+        assert len(self.job_name2var_name2var_blob_) == 0
+        del self.var_name2var_blob_
+        del self.job_name2module_name2module_
+        self.ReleaseLazyRefBlob()
+        self.ForceReleaseEagerBlobs()
+        oneflow._oneflow_internal.StopLazyGlobalSession()
+        oneflow._oneflow_internal.DestroyLazyGlobalSession()
+        self.resource_ = None
+        if self.eager_config_proto_ctx_:
+            del self.eager_config_proto_ctx_
+
+    def AddJob(self, function_desc):
+        assert self.status_ is SessionStatus.OPEN
+        assert isinstance(function_desc, FunctionDesc)
+        self.job_name2function_desc_[function_desc.job_func.__name__] = function_desc
+
+    def StashJob(self, job_name=None, key=None):
+        assert self.status_ is SessionStatus.RUNNING, "current status {}".format(
+            self.status_
+        )
+        job = c_api_util.GetCurrentJob()
+        if job_name is not None:
+            assert (
+                job.job_conf.job_name == job_name
+            ), "{} is not current job name".format(job_name)
+        else:
+            job_name = job.job_conf.job_name
+        if key is None:
+            key = job_name
+        self.job_name2job_[key] = job
+
+    def Job(self, job_name):
+        assert self.status_ is SessionStatus.RUNNING
+        if job_name not in self.job_name2job_:
+            return None
+        return self.job_name2job_[job_name]
+
+    def Sync(self):
+        assert self.status_ is SessionStatus.RUNNING
+        self.cond_var_.acquire()
+        while self.running_job_cnt_ > 0:
+            self.cond_var_.wait()
+        assert self.running_job_cnt_ == 0
+        self.cond_var_.release()
+
+    def ReleaseLazyRefBlob(self):
+        self.op_name2lazy_blob_cache_.clear()
+
+    def ForceReleaseEagerBlobs(self):
+        oneflow._oneflow_internal.GetDefaultBlobRegister().ForceReleaseAll()
+        self.backward_blob_register_.ForceReleaseAll()
+
+    def LazyRun(self, job_func, *arg):
+        assert self.status_ is SessionStatus.RUNNING
+        remote_blobs = self.LaunchUserJob(job_func, *arg)
+        if remote_blobs is None:
+            return
+        future_blob = LazyFutureRemoteBlobs(self).SetResult(remote_blobs).Inited()
+        annotation = inspect.signature(job_func).return_annotation
+        return oft_util.TransformGlobalFunctionResult(future_blob, annotation)
+
+    def EagerRun(self, function_desc, *arg):
+        with self._EagerGlobalFunctionDescScope(function_desc):
+            remote_blobs = compiler.EagerRun(
+                self, function_desc, self.config_proto, arg
+            )
+            if remote_blobs is None:
+                return
+            future_blob = EagerFutureRemoteBlobs().SetResult(remote_blobs).Inited()
+        annotation = inspect.signature(function_desc.job_func).return_annotation
+        return oft_util.TransformGlobalFunctionResult(future_blob, annotation)
+
+    def LaunchUserJob(self, job_func, *arg):
+        assert self.status_ is SessionStatus.RUNNING
+        job_name = job_func.__name__
+        push_util.AsyncPush(self, job_func, *arg)
+        self.LaunchJob(job_instance_util.MakeUserJobInstance(job_name))
+        return job_func.__oneflow_output_remote_blobs__
+
+    def LaunchJob(self, job_instance):
+        assert self.status_ is SessionStatus.RUNNING
+        self._IncRunningJobCnt()
+        job_instance.AddPostFinishCallback(lambda _: self._DecRunningJobCnt())
+        oneflow._oneflow_internal.LaunchJob(job_instance)
+
+    def AsyncPush(self, op_name, push_data_cb):
+        assert self.status_ is SessionStatus.RUNNING
+        push_job_name = self.inter_user_job_info.input_or_var_op_name2push_job_name[
+            op_name
+        ]
+        self.LaunchJob(
+            job_instance_util.MakePushJobInstance(push_job_name, op_name, push_data_cb)
+        )
+
+    def AsyncPull(self, op_name, pull_data_cb):
+        assert self.status_ is SessionStatus.RUNNING
+        pull_job_name = self.inter_user_job_info.output_or_var_op_name2pull_job_name[
+            op_name
+        ]
+        self.LaunchJob(
+            job_instance_util.MakePullJobInstance(pull_job_name, op_name, pull_data_cb)
+        )
+
+    def HasAnyCallbackAfterFunctionReturn(self):
+        return len(self.uuid2watch_handler) > 0
+
+    def StashVariableBlob4Job(self, job_name, var_name, var_blob):
+        if var_name not in self.var_name2var_blob_:
+            self.var_name2var_blob_[var_name] = var_blob
+        if job_name not in self.job_name2var_name2var_blob_:
+            self.job_name2var_name2var_blob_[job_name] = dict()
+        assert var_name not in self.job_name2var_name2var_blob_[job_name]
+        self.job_name2var_name2var_blob_[job_name][var_name] = var_blob
+
+    def AddInfo4InterfaceOpName(self, interface_op_name, op_attribute):
+        if flow.eager_execution_enabled():
+            self.interface_op_name2op_attr_[interface_op_name] = op_attribute
+            self.interface_op_name2job_name_[
+                interface_op_name
+            ] = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+        else:
+            pass
+
+    def OpAttribute4InterfaceOpName(self, interface_op_name):
+        return self.interface_op_name2op_attr_[interface_op_name]
+
+    def ParallelConf4LazyInterfaceOpName(self, interface_op_name):
+        return self.lazy_interface_op_name2parallel_conf_[interface_op_name]
+
+    def JobName4InterfaceOpName(self, interface_op_name):
+        return self.interface_op_name2job_name_[interface_op_name]
+
+    @property
+    def interface_ops(self):
+        return self.interface_op_name2op_attr_.keys()
+
+    def TryGetVariableBlobOfJobFromStash(self, job_name, var_name):
+        if var_name not in self.var_name2var_blob_:
+            return (None, None)
+        global_variable_blob = self.var_name2var_blob_[var_name]
+        if job_name not in self.job_name2var_name2var_blob_:
+            return (global_variable_blob, None)
+        var_name2var_blob = self.job_name2var_name2var_blob_[job_name]
+        if var_name not in var_name2var_blob:
+            return (global_variable_blob, None)
+        return (global_variable_blob, var_name2var_blob[var_name])
+
+    def CurrentEagerGlobalFunctionDesc(self):
+        if len(self.eager_global_function_desc_stack_) == 0:
+            return None
+        return self.eager_global_function_desc_stack_[0]
+
+    def has_empty_is_mirrored_strategy_enabled_stack(self):
+        return self.sess_.is_mirrored_strategy_enabled_stack_size() == 0
+
+    def push_mirrored_strategy_enabled(self, val):
+        assert isinstance(val, bool)
+        self.sess_.push_mirrored_strategy_enabled(val)
+
+    def pop_mirrored_strategy_enabled(self):
+        self.sess_.pop_mirrored_strategy_enabled()
+
+    def is_mirrored_strategy_enabled(self):
+        return self.sess_.is_mirrored_strategy_enabled()
+
+    def is_consistent_strategy_enabled(self):
+        return self.sess_.is_consistent_strategy_enabled()
+
+    @contextmanager
+    def _EagerGlobalFunctionDescScope(self, function_desc):
+        assert len(self.backward_blob_register.blob_name2object) == 0
+        assert len(self.job_name2var_name2var_blob_) == 0
+        self.eager_global_function_desc_stack_.insert(0, function_desc)
+        try:
+            yield
+        finally:
+            self.existed_module_names_ = set()
+            self.job_name2var_name2var_blob_ = dict()
+            self.eager_global_function_desc_stack_.pop(0)
+            keys = list(dict(self.backward_blob_register.blob_name2object).keys())
+            for key in keys:
+                self.backward_blob_register.ClearObject4BlobName(key)
+
+    def _IncRunningJobCnt(self):
+        assert self.status_ is SessionStatus.RUNNING
+        self.cond_var_.acquire()
+        self.running_job_cnt_ += 1
+        self.cond_var_.release()
+
+    def _DecRunningJobCnt(self):
+        self.cond_var_.acquire()
+        self.running_job_cnt_ -= 1
+        self.cond_var_.notify()
+        self.cond_var_.release()
+
+    def __del__(self):
+        self.TryClose()
+
+
+def api_find_or_create_module(
+    module_name: str, create: Callable[[], None], reuse: bool = False
+):
+    func = enable_if.unique([find_or_create_module])
+    return func(module_name, create, reuse)
+
+
+@enable_if.condition(hob.in_global_mode)
+def find_or_create_module(module_name, create, reuse=False):
+    assert callable(create)
+    sess = session_ctx.GetDefaultSession()
+    job_name = flow.current_global_function_desc().job_config_proto.job_name()
+    if job_name not in sess.job_name2module_name2module_:
+        sess.job_name2module_name2module_[job_name] = {}
+    module_name2module = sess.job_name2module_name2module_[job_name]
+    if module_name not in module_name2module:
+        module = create()
+        assert isinstance(module, module_util.Module)
+        module_name2module[module_name] = module
+    elif not reuse:
+        assert module_name not in sess.existed_module_names_, (
+            "duplicated module_name `%s' in global_function `%s'"
+            % (module_name, job_name)
+        )
+    else:
+        pass
+    sess.existed_module_names_.add(module_name)
+    return module_name2module[module_name]
+
+
+def api_eager_execution_enabled() -> bool:
+    """Get current setting of the job, if enable eager execution mode ,then return True
+
+    Returns:
+        bool: [description]
+    """
+    return oneflow._oneflow_internal.EagerExecutionEnabled()
+
+
+def api_clear_default_session() -> None:
+    """Clear the default session. All compiled OneFlow functions will be deleted.
+    """
+    func = enable_if.unique([clear_default_session])
+    return func()
+
+
+@enable_if.condition(hob.in_normal_mode)
+def clear_default_session():
+    session_ctx.TryCloseDefaultSession()
+    session_ctx.OpenDefaultSession(Session(oneflow._oneflow_internal.NewSessionId()))
+
+
+def api_sync_default_session() -> None:
+    """Synchronize the default session. Block until every synchronous OneFlow function and its callback finishes running.
+    """
+    func = enable_if.unique([sync_default_session])
+    return func()
+
+
+@enable_if.condition(hob.in_normal_mode)
+def sync_default_session() -> None:
+    session_ctx.GetDefaultSession().Sync()
+
+
+def _TryCompleteConfigProto(config_proto):
+    if config_proto.resource.machine_num == 0:
+        config_proto.resource.machine_num = oneflow._oneflow_internal.GetNodeSize()
+
+
+def _GetDefaultConfigProto():
+    config_proto = job_set_util.ConfigProto()
+    config_proto.resource.machine_num = 0
+    if oneflow._oneflow_internal.flags.with_cuda():
+        config_proto.resource.gpu_device_num = 1
+    else:
+        config_proto.resource.cpu_device_num = 1
+        config_proto.resource.gpu_device_num = 0
+    config_proto.session_id = session_ctx.GetDefaultSession().id
+    return config_proto
+
+
+def TmpInitEagerGlobalSession():
+    config_pb = _GetDefaultConfigProto()
+    config_proto_str = text_format.MessageToString(config_pb)
+    oneflow._oneflow_internal.InitEagerGlobalSession(config_proto_str)
diff --git a/python/oneflow/compatible/single_client/framework/sysconfig.py b/python/oneflow/compatible/single_client/framework/sysconfig.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2629c95b11b1cac0b93ff84f6274ddbb0c76204
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/sysconfig.py
@@ -0,0 +1,71 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import imp
+import importlib.util
+import os
+from typing import List
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+
+
+def get_include() -> str:
+    return os.path.join(os.path.dirname(oneflow.__file__), "include")
+
+
+def get_lib() -> str:
+    return os.path.dirname(oneflow.__file__)
+
+
+def get_compile_flags() -> List[str]:
+    flags = []
+    flags.append("-I{}".format(get_include()))
+    flags.append("-DHALF_ENABLE_CPP11_USER_LITERALS=0")
+    if oneflow._oneflow_internal.flags.with_cuda():
+        flags.append("-DWITH_CUDA")
+    if oneflow._oneflow_internal.flags.use_cxx11_abi():
+        flags.append("-D_GLIBCXX_USE_CXX11_ABI=1")
+    else:
+        flags.append("-D_GLIBCXX_USE_CXX11_ABI=0")
+    return flags
+
+
+def get_link_flags() -> List[str]:
+    flags = []
+    flags.append("-L{}".format(get_lib()))
+    (file, oneflow_internal_lib_path, _) = imp.find_module(
+        "_oneflow_internal", [get_lib()]
+    )
+    if file:
+        file.close()
+    flags.append("-l:{}".format(os.path.basename(oneflow_internal_lib_path)))
+    return flags
+
+
+def with_cuda() -> bool:
+    return oneflow._oneflow_internal.flags.with_cuda()
+
+
+def with_xla() -> bool:
+    return oneflow._oneflow_internal.flags.with_xla()
+
+
+def has_rpc_backend_grpc() -> bool:
+    return oneflow._oneflow_internal.flags.has_rpc_backend_grpc()
+
+
+def has_rpc_backend_local() -> bool:
+    return oneflow._oneflow_internal.flags.has_rpc_backend_local()
diff --git a/python/oneflow/compatible/single_client/framework/tensor.py b/python/oneflow/compatible/single_client/framework/tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8951c45916d16e26ad36aef6829b329ed35e31ea
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/tensor.py
@@ -0,0 +1,944 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+from typing import Union
+
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import check_point_v2 as check_point_v2
+from oneflow.compatible.single_client.framework import dtype as dtype_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import ofblob as ofblob_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import runtime_mode as rt_mode
+from oneflow.compatible.single_client.framework import tensor_str as tensor_str_util
+from oneflow.compatible.single_client.framework.function_util import (
+    global_function_or_identity,
+)
+from oneflow.compatible.single_client.ops import initializer_util as initializer_util
+from oneflow.compatible.single_client.support import async_util as async_util
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+
+
+def register_local_tensor_method(name=None):
+    def decorator(method):
+        if name is None:
+            op_name = method.__name__
+        else:
+            op_name = name
+        setattr(oneflow._oneflow_internal.Tensor, op_name, method)
+        return method
+
+    return decorator
+
+
+@register_local_tensor_method("numpy")
+def _local_tensor_numpy(eager_local_tensor):
+    if eager_local_tensor.dtype == flow.tensor_buffer:
+        (shapes, dtypes) = eager_local_tensor._tensor_buffer_shapes_and_dtypes
+        tensors = flow.experimental.tensor_buffer_to_list_of_tensors(
+            Tensor(eager_local_tensor), shapes, dtypes
+        )
+        return [t.numpy() for t in tensors]
+    method_name = eager_local_tensor._get_copy_mirrored_tensor_to_numpy_func_name()
+    copy_to_numpy = getattr(eager_local_tensor, method_name)
+    ndarray = np.empty(
+        tuple(eager_local_tensor.shape),
+        dtype=flow.convert_oneflow_dtype_to_numpy_dtype(eager_local_tensor.dtype),
+    )
+    copy_to_numpy(ndarray)
+    return ndarray
+
+
+@register_local_tensor_method("copy_")
+def _copy_from_numpy_to_eager_local_tensor(eager_local_tensor, np_arr):
+    method_name = eager_local_tensor._get_copy_mirrored_tensor_from_numpy_func_name()
+    copy_from_numpy = getattr(eager_local_tensor, method_name)
+    assert np_arr.dtype == flow.convert_oneflow_dtype_to_numpy_dtype(
+        eager_local_tensor.dtype
+    )
+    if np_arr.shape == ():
+        assert tuple(eager_local_tensor.shape) == (1,)
+    else:
+        assert np_arr.shape == tuple(eager_local_tensor.shape)
+    copy_from_numpy(np_arr)
+
+
+@register_local_tensor_method("_init_by_initializer_conf")
+def _init_eager_local_tensor_by_initializer_conf(
+    eager_local_tensor, initializer_conf, random_seed=0
+):
+    shape = tuple(eager_local_tensor.shape)
+    initializer = initializer_util.GetInitializer(initializer_conf, random_seed, shape)
+    if initializer is None:
+        return
+    _copy_from_numpy_to_eager_local_tensor(
+        eager_local_tensor,
+        check_point_v2.generate_values_by_initializer(
+            initializer, shape, eager_local_tensor.dtype
+        ),
+    )
+
+
+def construct_tensor(
+    data,
+    dtype=None,
+    device=None,
+    requires_grad=False,
+    placement=None,
+    sbp=None,
+    is_consistent=False,
+    is_lazy=False,
+):
+    if _is_scalar(data) or _input_args_is_data(data):
+        if (
+            not _input_args_is_numpy(data)
+            and dtype is None
+            and _input_dtype_is_float(data)
+        ):
+            dtype = flow.float32
+        data = np.array(data)
+        if dtype is None:
+            dtype = dtype_util.convert_numpy_dtype_to_oneflow_dtype(data.dtype)
+        return Tensor(
+            data,
+            dtype=dtype,
+            device=device,
+            requires_grad=requires_grad,
+            placement=placement,
+            sbp=sbp,
+            is_consistent=is_consistent,
+            is_lazy=is_lazy,
+        )
+    else:
+        raise TypeError("Construction error, invalid combination of arguments")
+
+
+class Tensor:
+    def __init__(
+        self,
+        *args,
+        dtype=None,
+        device=None,
+        requires_grad=False,
+        placement=None,
+        sbp=None,
+        is_consistent=False,
+        is_lazy=False,
+        data_initializer=None,
+        determining_initializer=None,
+    ):
+        assert len(args) > 0
+        dtype = dtype if dtype is not None else oneflow._oneflow_internal.float32
+        if isinstance(device, str):
+            device = flow.device(device)
+        if placement is None:
+            device = (
+                device
+                if device is not None
+                else oneflow._oneflow_internal.device("cpu")
+            )
+        if _input_args_is_tensor(*args):
+            self._local_or_consistent_tensor = flow.to(
+                *args, device=args[0].device, dtype=args[0].dtype, copy=True
+            )
+            self._undetermined_tensor = None
+        elif _input_args_is_consistent_or_local(*args):
+            self._local_or_consistent_tensor = args[0]
+            self._undetermined_tensor = None
+        elif _input_args_is_data(*args):
+            self._local_or_consistent_tensor = None
+            self._construct_with_data(
+                *args,
+                dtype=dtype,
+                device=device,
+                requires_grad=requires_grad,
+                placement=placement,
+                sbp=sbp,
+                is_consistent=is_consistent,
+                is_lazy=is_lazy,
+            )
+        elif _input_args_is_shape(*args):
+            shape = args
+            self._local_or_consistent_tensor = None
+            self._undetermined_tensor = UndeterminedTensor(
+                shape,
+                dtype,
+                device=device,
+                requires_grad=requires_grad,
+                placement=placement,
+                sbp=sbp,
+                is_consistent=is_consistent,
+                is_lazy=is_lazy,
+                data_initializer=data_initializer,
+            )
+            if determining_initializer is None:
+                determining_initializer = _default_initializer_for_determining
+            self._determining_initializer = determining_initializer
+        else:
+            raise TypeError("new() received an invalid combination of arguments")
+
+    @property
+    def shape(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.shape
+        else:
+            return self._undetermined_tensor.shape
+
+    @property
+    def device(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.device
+        else:
+            return self._undetermined_tensor.device
+
+    @register_local_tensor_method("ndim")
+    @property
+    def ndim(self):
+        return len(self.shape)
+
+    @property
+    def is_cuda(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_cuda
+        else:
+            return self._undetermined_tensor.is_cuda
+
+    @property
+    def dtype(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.dtype
+        else:
+            return self._undetermined_tensor.dtype
+
+    def _auto_determine(func):
+        def wrapped_func(*args, **kwargs):
+            tensor = args[0]
+            if not tensor.is_determined:
+                tensor.determine()
+            return func(*args, **kwargs)
+
+        return wrapped_func
+
+    @property
+    @_auto_determine
+    def data(self):
+        if self._local_or_consistent_tensor is not None:
+            return flow.Tensor(self._local_or_consistent_tensor.data)
+        else:
+            return None
+
+    @property
+    def grad(self):
+        if self._local_or_consistent_tensor is not None:
+            if self._local_or_consistent_tensor.grad is not None:
+                return flow.Tensor(self._local_or_consistent_tensor.grad)
+        else:
+            return None
+
+    @grad.setter
+    @_auto_determine
+    def grad(self, new_grad):
+        def check_grad(grad, new_grad):
+            assert grad.shape == new_grad.shape, "Shape of new grad is not equal"
+            assert grad.device == new_grad.device, "Device of new grad is not equal"
+            assert grad.dtype == new_grad.dtype, "Data type of new grad is not equal"
+            assert type(grad) == type(new_grad), "Type of new grad is not equal"
+
+        if self._local_or_consistent_tensor is not None:
+            if new_grad is None:
+                self._local_or_consistent_tensor.set_grad(None)
+            else:
+                new_grad_detach = new_grad.detach()._local_or_consistent_tensor
+                check_grad(self._local_or_consistent_tensor.grad, new_grad_detach)
+                self._local_or_consistent_tensor.set_grad(new_grad_detach)
+
+    @property
+    def grad_fn(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.grad_fn
+        else:
+            return None
+
+    @property
+    def requires_grad(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.requires_grad
+        else:
+            return self._undetermined_tensor.requires_grad
+
+    @property
+    def is_leaf(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_leaf
+        else:
+            return True
+
+    @requires_grad.setter
+    def requires_grad(self, requires_grad):
+        if self._local_or_consistent_tensor is not None:
+            self._local_or_consistent_tensor.requires_grad = requires_grad
+        else:
+            self._undetermined_tensor.requires_grad = requires_grad
+
+    @register_local_tensor_method()
+    def size(self, idx=None):
+        if idx is None:
+            return self.shape
+        else:
+            return self.shape[idx]
+
+    @register_local_tensor_method()
+    def dim(self):
+        return self.ndim
+
+    @register_local_tensor_method()
+    def ndimension(self):
+        return self.ndim
+
+    @_auto_determine
+    def detach(self):
+        if self._local_or_consistent_tensor is not None:
+            return flow.Tensor(self._local_or_consistent_tensor.detach())
+        else:
+            return None
+
+    def requires_grad_(self, requires_grad=True):
+        self.requires_grad = requires_grad
+
+    def get_device(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.device
+        else:
+            return self._undetermined_tensor.device
+
+    @register_local_tensor_method()
+    def nelement(self):
+        prod = 1
+        for dim in self.shape:
+            prod *= dim
+        return prod
+
+    @register_local_tensor_method()
+    def numel(self):
+        return self.nelement()
+
+    def retain_grad(self):
+        assert self.is_determined
+        self._local_or_consistent_tensor.retain_grad()
+
+    def data_ptr(self):
+        TODO()
+
+    def element_size(self):
+        return self.dtype.bytes
+
+    @_auto_determine
+    def numpy(self):
+        internal_tensor = self._local_or_consistent_tensor
+        if not internal_tensor.is_lazy and (not internal_tensor.is_consistent):
+            return _local_tensor_numpy(internal_tensor)
+        raise NotImplementedError()
+
+    @register_local_tensor_method()
+    def tolist(self):
+        return self.numpy().tolist()
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def backward(self, gradient=None, retain_graph=False, create_graph=False):
+        flow.autograd.backward(self, gradient, retain_graph, create_graph)
+
+    @register_local_tensor_method()
+    def _transform_ellipsis_type(self, key):
+        d = self.ndim - len(key)
+        new_key = list()
+        for k in key:
+            if isinstance(k, type(Ellipsis)):
+                new_key.append(slice(None, None, None))
+                while d > 0:
+                    new_key.append(slice(None, None, None))
+                    d -= 1
+            else:
+                new_key.append(k)
+        return tuple(new_key)
+
+    @register_local_tensor_method()
+    def _get_slice_obj(self, key):
+        def get_or_default(x, default):
+            return x if x is not None else default
+
+        def get_canonical_index(index, length, *, start=0):
+            if index < 0:
+                index += length
+            if index > length or index < 0:
+                raise IndexError(f"Index should be in [0, {length}), but got {index}")
+            return max(min(index, length), start)
+
+        def get_slice_if_int(x):
+            if isinstance(x, slice):
+                return x
+            return slice(x, x + 1)
+
+        if isinstance(key, tuple):
+            assert all((isinstance(x, (slice, int)) for x in key))
+        else:
+            assert isinstance(key, (slice, int))
+            key = (key,)
+        key = list(map(get_slice_if_int, key))
+        assert len(key) <= len(self.shape)
+        for i in range(len(key), len(self.shape)):
+            key += (slice(None, None, None),)
+        starts = [
+            get_canonical_index(get_or_default(x.start, 0), self.shape[i])
+            for (i, x) in enumerate(key)
+        ]
+        stops = [
+            get_canonical_index(
+                get_or_default(x.stop, self.shape[i]), self.shape[i], start=starts[i]
+            )
+            for (i, x) in enumerate(key)
+        ]
+        steps = [get_or_default(x.step, 1) for x in key]
+        assert all((x > 0 for x in steps))
+        shape = (np.abs(np.array(stops) - np.array(starts)) - 1) // np.abs(
+            np.array(steps)
+        ) + 1
+        shape = shape.tolist()
+        return (starts, stops, steps, shape)
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def __getitem__(self, key):
+        assert (
+            isinstance(key, int) or isinstance(key, tuple) or isinstance(key, slice)
+        ), "Unsupported key type!"
+        squeeze_dims = None
+        if isinstance(key, tuple):
+            key = self._transform_ellipsis_type(key)
+            squeeze_dims = list(
+                filter(lambda idx: isinstance(key[idx], int), range(len(key)))
+            )
+        elif isinstance(key, int):
+            squeeze_dims = [0]
+        else:
+            pass
+        (start, stop, step, _) = self._get_slice_obj(key)
+        res = flow.experimental.slice(self, list(zip(start, stop, step)))
+        if squeeze_dims is not None:
+            res = res.squeeze(dim=squeeze_dims)
+        return res
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def __setitem__(self, key, value):
+        if isinstance(key, tuple):
+            key = self._transform_ellipsis_type(key)
+            unsqueeze_dims = list(
+                filter(lambda idx: isinstance(key[idx], int), range(len(key)))
+            )
+        elif isinstance(key, int):
+            unsqueeze_dims = [0]
+        else:
+            unsqueeze_dims = []
+        (start, stop, step, shape) = self._get_slice_obj(key)
+        if isinstance(value, (int, float)):
+            scalar = value
+            value = flow.Tensor(*shape)
+            value.fill_(scalar)
+        else:
+            prepended_broadcasting_dims = range(
+                len(self.shape) - len(unsqueeze_dims) - len(value.shape)
+            )
+            for dim in prepended_broadcasting_dims:
+                value = flow.experimental.unsqueeze(value, dim)
+            for dim in unsqueeze_dims:
+                value = flow.experimental.unsqueeze(value, dim)
+            value = flow.experimental.expand(value, *shape)
+        flow.experimental.tmp.logical_slice_assign(
+            self, value, list(zip(start, stop, step))
+        )
+        return self
+
+    @register_local_tensor_method()
+    def __str__(self):
+        return self.__repr__()
+
+    @register_local_tensor_method()
+    def __repr__(self):
+        return tensor_str_util._gen_tensor_str(self)
+
+    @register_local_tensor_method()
+    def __gt__(self, other):
+        return self.gt(other)
+
+    @register_local_tensor_method()
+    def __lt__(self, other):
+        return self.lt(other)
+
+    @register_local_tensor_method()
+    def __ge__(self, other):
+        return self.ge(other)
+
+    @register_local_tensor_method()
+    def __le__(self, other):
+        return self.le(other)
+
+    def __array__(self):
+        TODO()
+
+    def __sizeof__(self):
+        TODO()
+
+    def __deepcopy__(self, memo):
+        TODO()
+
+    @register_local_tensor_method()
+    def __mul__(self, other):
+        return self.mul(other)
+
+    @register_local_tensor_method()
+    def __rmul__(self, other):
+        return self.mul(other)
+
+    @register_local_tensor_method()
+    def __add__(self, other):
+        return self.add(other)
+
+    @register_local_tensor_method()
+    def __radd__(self, other):
+        return self.add(other)
+
+    @register_local_tensor_method()
+    def __sub__(self, other):
+        return self.sub(other)
+
+    @register_local_tensor_method()
+    def __rsub__(self, other):
+        return flow.experimental.sub(other, self)
+
+    @register_local_tensor_method()
+    def __truediv__(self, other):
+        return self.div(other)
+
+    @register_local_tensor_method()
+    def __rtruediv__(self, other):
+        return flow.experimental.div(other, self)
+
+    @register_local_tensor_method()
+    def __neg__(self):
+        return flow.experimental.neg(self)
+
+    @register_local_tensor_method()
+    def __pow__(self, b):
+        return flow.experimental.pow(self, b)
+
+    def _determine_if_needed(self, determining_initializer=None):
+        if not self.is_determined:
+            self.determine(determining_initializer)
+
+    def determine(self, determining_initializer=None):
+        assert not self.is_determined
+        if determining_initializer is None:
+            determining_initializer = self._determining_initializer
+        self._local_or_consistent_tensor = determining_initializer(self)
+        self._undetermined_tensor = None
+
+    @property
+    def is_determined(self):
+        if self._local_or_consistent_tensor is not None:
+            assert self._undetermined_tensor is None
+            return True
+        else:
+            assert self._undetermined_tensor is not None
+            return False
+
+    def set_placement(self, placement):
+        assert isinstance(placement, flow.placement)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.placement = placement
+        self._undetermined_tensor.device = None
+
+    def set_sbp(self, sbp):
+        assert isinstance(sbp, oneflow._oneflow_internal.Distribute)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.sbp = sbp
+
+    def set_is_consistent(self, is_consistent):
+        assert isinstance(is_consistent, bool)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.is_consistent = is_consistent
+
+    def set_is_lazy(self, is_lazy):
+        assert isinstance(is_lazy, bool)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.is_lazy = is_lazy
+
+    def set_data_initializer(self, data_initializer):
+        assert isinstance(data_initializer, initializer_conf_util.InitializerConf)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.data_initializer = data_initializer
+
+    @property
+    def placement(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.placement
+        else:
+            return self._undetermined_tensor.placement
+
+    @property
+    def is_lazy(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_lazy
+        else:
+            return self._undetermined_tensor.is_lazy
+
+    @property
+    def is_consistent(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_consistent
+        else:
+            return self._undetermined_tensor.is_consistent
+
+    @property
+    def sbp(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.sbp
+        else:
+            return self._undetermined_tensor.sbp
+
+    @register_local_tensor_method()
+    def uniform_(self, a=0, b=1):
+        initializer_conf = flow.random_uniform_initializer(
+            minval=a, maxval=b, dtype=self.dtype
+        )
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def kaiming_uniform_(
+        self, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+    ):
+        initializer_conf = flow.kaiming_initializer(
+            shape=self.shape,
+            distribution="random_uniform",
+            mode=mode,
+            nonlinearity=nonlinearity,
+            negative_slope=a,
+            data_format=data_format,
+        )
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def kaiming_normal_(
+        self, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+    ):
+        initializer_conf = flow.kaiming_initializer(
+            shape=self.shape,
+            distribution="random_normal",
+            mode=mode,
+            nonlinearity=nonlinearity,
+            negative_slope=a,
+            data_format=data_format,
+        )
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def xavier_normal_(self, gain=1.0, *, data_format="NCHW"):
+        assert gain == 1.0, "Only gain == 1.0 is supported now"
+        initializer_conf = flow.xavier_normal_initializer(data_format=data_format)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def xavier_uniform_(self, gain=1.0, *, data_format="NCHW"):
+        assert gain == 1.0, "Only gain == 1.0 is supported now"
+        initializer_conf = flow.xavier_uniform_initializer(data_format=data_format)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def normal_(self, mean=0, std=1):
+        initializer_conf = flow.random_normal_initializer(mean=mean, stddev=std)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def fill_(self, value):
+        initializer_conf = flow.constant_initializer(value=value, dtype=self.dtype)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @_auto_determine
+    def zeros_(self):
+        internal_tensor = self._local_or_consistent_tensor
+        if internal_tensor.is_lazy:
+            TODO()
+        if internal_tensor.is_consistent:
+            TODO()
+        internal_tensor.zeros_()
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def register_hook(self, hook):
+        assert self.is_leaf, "register_hook only supports leaf tensor for now"
+        assert (
+            self.requires_grad
+        ), "register_hook only supports tensor with requires_grad=True"
+
+        def hook_returning_determined_tensor(grad):
+            new_grad = hook(grad)
+            if isinstance(new_grad, Tensor) and (not new_grad.is_determined):
+                new_grad.determine()
+                new_grad = new_grad._local_or_consistent_tensor
+            return new_grad
+
+        self._local_or_consistent_tensor._register_hook(
+            hook_returning_determined_tensor
+        )
+
+    @_auto_determine
+    def copy_(self, other: Union["Tensor", np.ndarray]):
+        internal_tensor = self._local_or_consistent_tensor
+        if internal_tensor.is_lazy:
+            TODO()
+        if internal_tensor.is_consistent:
+            TODO()
+        if isinstance(other, (Tensor, check_point_v2.FileBackendVariableBlob)):
+            src_np = other.numpy()
+        else:
+            assert isinstance(other, np.ndarray)
+            src_np = other
+        _copy_from_numpy_to_eager_local_tensor(internal_tensor, src_np)
+
+    def _init_by_initializer_conf(self, initializer_conf):
+        if self.is_determined:
+            if self.is_consistent:
+                with self._placement_scope():
+                    check_point_v2.init_by_initializer_conf(
+                        self, initializer_conf, True, None
+                    )
+            else:
+                _init_eager_local_tensor_by_initializer_conf(
+                    self._local_or_consistent_tensor, initializer_conf
+                )
+        else:
+            self.set_data_initializer(initializer_conf)
+        return self
+
+    def _placement_scope(self):
+        if self.is_consistent:
+            return _convert_to_placement_scope(self.placement)
+        else:
+            return _convert_to_placement_scope(self.device)
+
+    def _construct_with_data(
+        self,
+        *args,
+        dtype=None,
+        device=None,
+        requires_grad=False,
+        placement=None,
+        sbp=None,
+        is_consistent=False,
+        is_lazy=False,
+    ):
+        numpy_data = None
+        if _input_args_is_tuple_or_list(*args):
+            numpy_data = np.array(args[0])
+        elif _input_args_is_numpy(*args):
+            numpy_data = np.ascontiguousarray(args[0])
+        numpy_data = numpy_data.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+        shape = oneflow._oneflow_internal.Size(tuple(numpy_data.shape))
+        self._determining_initializer = _numpy_initializer_for_determining
+        self._undetermined_tensor = UndeterminedTensor(
+            shape,
+            dtype,
+            device=device,
+            requires_grad=requires_grad,
+            placement=placement,
+            sbp=sbp,
+            is_consistent=is_consistent,
+            is_lazy=is_lazy,
+            numpy_data=numpy_data,
+        )
+
+
+class UndeterminedTensor:
+    def __init__(
+        self,
+        shape,
+        dtype,
+        device=None,
+        requires_grad=False,
+        placement=None,
+        sbp=None,
+        is_consistent=False,
+        is_lazy=False,
+        data_initializer=None,
+        numpy_data=None,
+    ):
+        if not isinstance(shape, oneflow._oneflow_internal.Size):
+            if not isinstance(shape, tuple):
+                shape = tuple(shape)
+            shape = oneflow._oneflow_internal.Size(shape)
+        data_initializer = (
+            data_initializer
+            if data_initializer is not None
+            else flow.empty_initializer(dtype=dtype)
+        )
+        device = (
+            device if device is not None else oneflow._oneflow_internal.device("cpu")
+        )
+        self.shape = shape
+        self.dtype = dtype
+        self.device = device
+        self.requires_grad = requires_grad
+        self.placement = placement
+        self.sbp = sbp
+        self.is_consistent = is_consistent
+        self.is_lazy = is_lazy
+        self.data_initializer = data_initializer
+        self.numpy_data = numpy_data
+
+    @property
+    def is_cuda(self):
+        device_type = None
+        if self.placement is not None:
+            device_type = self.placement.device_tag
+        elif self.device is not None:
+            device_type = self.device.type
+        else:
+            raise ValueError("Neither placement nor device found.")
+        return device_type == "gpu" or device_type == "cuda"
+
+
+def _default_initializer_for_determining(tensor):
+    assert not tensor.is_determined
+    undetermined_tensor = tensor._undetermined_tensor
+    if undetermined_tensor.is_consistent:
+        raise NotImplementedError()
+    else:
+        shape = undetermined_tensor.shape
+        dtype = undetermined_tensor.dtype
+        determined_tensor = oneflow._oneflow_internal.Tensor(
+            shape,
+            dtype,
+            undetermined_tensor.device,
+            undetermined_tensor.is_lazy,
+            undetermined_tensor.requires_grad,
+            True,
+        )
+        _init_eager_local_tensor_by_initializer_conf(
+            determined_tensor, undetermined_tensor.data_initializer
+        )
+    return determined_tensor
+
+
+def _numpy_initializer_for_determining(tensor):
+    assert not tensor.is_determined
+    undetermined_tensor = tensor._undetermined_tensor
+    numpy_data = undetermined_tensor.numpy_data
+    assert numpy_data is not None
+    if undetermined_tensor.is_consistent:
+        raise NotImplementedError()
+    else:
+        determined_tensor = oneflow._oneflow_internal.Tensor(
+            undetermined_tensor.shape,
+            undetermined_tensor.dtype,
+            undetermined_tensor.device,
+            undetermined_tensor.is_lazy,
+            undetermined_tensor.requires_grad,
+            True,
+        )
+        _copy_from_numpy_to_eager_local_tensor(determined_tensor, numpy_data)
+    return determined_tensor
+
+
+def _input_args_is_tuple_or_list(*args):
+    return len(args) == 1 and isinstance(args[0], (tuple, list))
+
+
+def _input_args_is_numpy(*args):
+    return len(args) == 1 and isinstance(args[0], np.ndarray)
+
+
+def _input_args_is_consistent_or_local(*args):
+    return len(args) == 1 and isinstance(args[0], oneflow._oneflow_internal.Tensor)
+
+
+def _input_args_is_tensor(*args):
+    return len(args) == 1 and isinstance(args[0], flow.Tensor)
+
+
+def _input_args_is_data(*args):
+    return _input_args_is_numpy(*args) or _input_args_is_tuple_or_list(*args)
+
+
+def _input_args_is_shape(*args):
+    return all((isinstance(x, int) for x in args))
+
+
+def register_tensor_op(op_name):
+    def set_tensor_op(method):
+        setattr(Tensor, op_name, method)
+        setattr(oneflow._oneflow_internal.Tensor, op_name, method)
+        return method
+
+    return set_tensor_op
+
+
+def _convert_to_placement_scope(placement_or_device):
+    if isinstance(placement_or_device, flow.placement):
+        placement = placement_or_device
+        return flow.scope.placement(
+            placement.device_tag,
+            list(placement.parallel_conf.device_name()),
+            placement.hierarchy,
+        )
+    else:
+        device = placement_or_device
+        machine_id = 0
+        if device.type == "cuda":
+            device_tag = "gpu"
+        else:
+            device_tag = device.type
+        return flow.scope.placement(
+            device_tag, "{}:{}".format(machine_id, device.index), None
+        )
+
+
+def _is_scalar(data):
+    return isinstance(data, (int, float, bool, complex))
+
+
+def _flatten_list_or_tuple(list_or_tuple):
+    for item in list_or_tuple:
+        if isinstance(item, (list, tuple)):
+            yield from _flatten_list_or_tuple(item)
+        else:
+            yield item
+
+
+def _input_dtype_is_float(data):
+    if _is_scalar(data):
+        return isinstance(data, float)
+    elif isinstance(data, (list, tuple)):
+        return any((isinstance(x, float) for x in _flatten_list_or_tuple(data)))
+    return False
diff --git a/python/oneflow/compatible/single_client/framework/tensor_str.py b/python/oneflow/compatible/single_client/framework/tensor_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e52a27506206ddc594ca6fda8d5d0f742d3acde
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/tensor_str.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+from oneflow.compatible import single_client as flow
+
+
+def _add_suffixes(tensor_str, suffixes, indent):
+    tensor_strs = [tensor_str]
+    last_line_len = len(tensor_str) - tensor_str.rfind("\n") + 1
+    linewidth = 80
+    for suffix in suffixes:
+        suffix_len = len(suffix)
+        if last_line_len + suffix_len + 2 > linewidth:
+            tensor_strs.append(",\n" + " " * indent + suffix)
+            last_line_len = indent + suffix_len
+        else:
+            tensor_strs.append(", " + suffix)
+            last_line_len += suffix_len + 2
+    tensor_strs.append(")")
+    return "".join(tensor_strs)
+
+
+def _gen_tensor_str(tensor):
+    prefix = "tensor("
+    indent = len(prefix)
+    suffixes = []
+    if tensor.device.type != "cpu" or (
+        tensor.device.type == "cuda" and tensor.device.index != 0
+    ):
+        suffixes.append("device='" + str(tensor.device) + "'")
+    suffixes.append("dtype=" + str(tensor.dtype))
+    if tensor.grad_fn is not None:
+        name = tensor.grad_fn.name()
+        suffixes.append("grad_fn=<{}>".format(name))
+    elif tensor.requires_grad:
+        suffixes.append("requires_grad=True")
+    tensor_str = np.array2string(
+        tensor.numpy(), precision=4, separator=", ", prefix=prefix
+    )
+    return _add_suffixes(prefix + tensor_str, suffixes, indent)
diff --git a/python/oneflow/compatible/single_client/framework/tensor_tuple_util.py b/python/oneflow/compatible/single_client/framework/tensor_tuple_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..db67e03cb6ddbc4a57d5cc1edbd901d59da901a1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/tensor_tuple_util.py
@@ -0,0 +1,43 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Tuple, Union
+
+from oneflow._oneflow_internal import Tensor, TensorTuple
+from oneflow.compatible.single_client.framework.tensor import Tensor as PyTensor
+
+
+def convert_to_tensor_tuple(
+    args: Optional[Union[PyTensor, Sequence[PyTensor], Tensor, Sequence[Tensor]]]
+):
+    if args is None:
+        return TensorTuple()
+    elif isinstance(args, collections.abc.Sequence):
+        if isinstance(args[0], PyTensor):
+            for tensor in args:
+                if not tensor.is_determined:
+                    tensor.determine()
+            return TensorTuple([x._local_or_consistent_tensor for x in args])
+        return TensorTuple(args)
+    else:
+        tensor_tuple = TensorTuple()
+        if isinstance(args, PyTensor):
+            if not args.is_determined:
+                args.determine()
+            tensor_tuple.append(args._local_or_consistent_tensor)
+        else:
+            tensor_tuple.append(args)
+        return tensor_tuple
diff --git a/python/oneflow/compatible/single_client/framework/typing.py b/python/oneflow/compatible/single_client/framework/typing.py
new file mode 100644
index 0000000000000000000000000000000000000000..15a8f955ca973a13dc121d57c53932ee1800d9da
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/typing.py
@@ -0,0 +1,151 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import sys
+import typing
+from typing import Optional, Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_def
+
+
+class PyStructCompatibleToBlob(object):
+    pass
+
+
+class Numpy(PyStructCompatibleToBlob):
+    """`Numpy` is a type hint for numpy output of a OneFlow global function
+    For instance::
+
+        @oneflow.compatible.single_client.global_function()
+        def foo() -> oneflow.compatible.single_client.typing.Numpy:
+            loss = ... # your network
+            return loss
+
+        loss = foo() # get a numpy.ndarray
+        print(loss)
+    """
+
+    def Placeholder(shape: Sequence[int], dtype=flow.float):
+        """`Numpy.Placeholder` is a typing function for numpy input of a OneFlow global function.
+        A `numpy.ndarray` takes a `Numpy.Placeholder`'s place must have an identical shape.
+        For instance::
+
+            @oneflow.compatible.single_client.global_function()
+            def foo(
+                image_blob: oneflow.compatible.single_client.typing.Numpy.Placeholder(
+                    (2, 255, 255, 3), dtype=flow.float32
+                )
+            ):
+                # your network
+
+            foo(np.random.randn(2, 255, 255, 3).astype(np.float32))
+
+        """
+        assert type(shape) is tuple, "shape should be a tuple. %s found" % shape
+        return type("Numpy.Placeholder", (NumpyDef,), dict(shape=shape, dtype=dtype))
+
+
+class ListNumpy(PyStructCompatibleToBlob):
+    """`ListNumpy` is a type hint for numpy output of a OneFlow global function
+    For instance::
+
+        @oneflow.compatible.single_client.global_function()
+        def foo() -> oneflow.compatible.single_client.typing.ListNumpy:
+            mirrored_tensors = ... # your network
+            return mirrored_tensors
+
+        mirrored_tensors = foo() # get a list of numpy.ndarray
+        for tensor in mirrored_tensors:
+            print(mirrored_tensors)
+    """
+
+    def Placeholder(shape: Sequence[int], dtype=flow.float):
+        """`ListNumpy.Placeholder` is a typing function for numpy input of a OneFlow global function.
+        A `list` of `numpy.ndarray` takes a `ListNumpy.Placeholder`'s place. Each `numpy.ndarray` in the `list` could have any shape as long as it has the same rank and a smaller/equal size.
+        For instance::
+
+            @oneflow.compatible.single_client.global_function()
+            def foo(
+                image_blob: oneflow.compatible.single_client.typing.ListNumpy.Placeholder(
+                    (2, 255, 255, 3), dtype=flow.float32
+                )
+            ):
+                # your network
+
+            input1 = np.random.randn(2, 255, 255, 3).astype(np.float32)
+            input2 = np.random.randn(2, 251, 251, 3).astype(np.float32)
+            foo([input1])
+            foo([input2])
+
+        """
+        assert type(shape) is tuple, "shape should be a tuple. %s found" % shape
+        return type(
+            "ListNumpy.Placeholder", (ListOfNumpyDef,), dict(shape=shape, dtype=dtype)
+        )
+
+
+class OneflowNumpyDef(object):
+    @classmethod
+    def NewInputBlobDef(subclass):
+        raise NotImplementedError
+
+
+class NumpyDef(OneflowNumpyDef):
+    @classmethod
+    def NewInputBlobDef(subclass):
+        return input_blob_def.FixedTensorDef(subclass.shape, dtype=subclass.dtype)
+
+
+class ListOfNumpyDef(OneflowNumpyDef):
+    @classmethod
+    def NewInputBlobDef(subclass):
+        return input_blob_def.MirroredTensorDef(subclass.shape, dtype=subclass.dtype)
+
+
+class Callback(typing.Generic[typing.TypeVar("T")]):
+    pass
+
+
+class Bundle(typing.Generic[typing.TypeVar("T")]):
+    """
+    One or a collection of  typing.Numpy/typing.ListNumpy,
+    such as x, [x], (x,), {"key": x} and the mixed form of them.
+    """
+
+    pass
+
+
+def OriginFrom(parameterised, generic):
+    if inspect.isclass(parameterised) and inspect.isclass(generic):
+        return issubclass(parameterised, generic)
+    if generic == OneflowNumpyDef:
+        assert not inspect.isclass(parameterised)
+        return False
+    if (sys.version_info.major, sys.version_info.minor) >= (3, 7):
+        if not hasattr(parameterised, "__origin__"):
+            return False
+        if generic == typing.Dict:
+            return parameterised.__origin__ is dict
+        if generic == typing.Tuple:
+            return parameterised.__origin__ is tuple
+        if generic == typing.List:
+            return parameterised.__origin__ is list
+        if generic == Callback:
+            return parameterised.__origin__ is Callback
+        if generic == Bundle:
+            return parameterised.__origin__ is Bundle
+    raise NotImplementedError("python typing is a monster torturing everyone.")
diff --git a/python/oneflow/compatible/single_client/framework/typing_util.py b/python/oneflow/compatible/single_client/framework/typing_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a83651912e9f13b44231b17b8eb6853e818656b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/typing_util.py
@@ -0,0 +1,300 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import typing
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.experimental import (
+    typing_check as enable_typing_check,
+)
+from oneflow.compatible.single_client.framework import local_blob as local_blob_util
+from oneflow.compatible.single_client.framework import pull_util as pull_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import typing as oft
+
+
+def CheckGlobalFunctionAnnotation(signature):
+    parameters = signature.parameters
+    if all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
+        for (_, p) in parameters.items():
+            assert (
+                p.kind == inspect._ParameterKind.POSITIONAL_OR_KEYWORD
+            ), "no parameters like *args or **kwargs supported"
+            CheckGlobalFunctionParamAnnotation(p.annotation)
+    elif enable_typing_check.typing_check_enabled:
+        for (param_name, p) in parameters.items():
+            if p.annotaion is inspect._empty:
+                raise NotImplementedError("parameter %s is not annotated" % param_name)
+    else:
+        pass
+    return_annotation = signature.return_annotation
+    if return_annotation is not inspect._empty:
+        CheckGlobalFunctionReturnAnnotation(return_annotation)
+    elif enable_typing_check.typing_check_enabled:
+        raise NotImplementedError("no return annotation found.")
+    else:
+        pass
+
+
+def CheckGlobalFunctionParamAnnotation(cls):
+    if oft.OriginFrom(cls, typing.Tuple):
+        assert cls.__args__ is not None, "T in typing.Tuple[T, ...] cannot be omitted"
+        assert len(cls.__args__) > 0
+        for cls_arg in cls.__args__:
+            CheckGlobalFunctionParamAnnotation(cls_arg)
+    elif oft.OriginFrom(cls, oft.OneflowNumpyDef):
+        pass
+    else:
+        raise NotImplementedError("invalid parameter annotation %s found" % cls)
+
+
+def CheckGlobalFunctionReturnAnnotation(cls):
+    if cls is None:
+        pass
+    elif oft.OriginFrom(cls, oft.Callback):
+        assert (
+            cls.__args__ is not None
+        ), "T in oneflow.compatible.single_client.typing.Callback[T] cannot be omitted"
+        assert len(cls.__args__) == 1
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[0])
+    elif oft.OriginFrom(cls, oft.Bundle):
+        assert cls.__args__[0] in (
+            oft.Numpy,
+            oft.ListNumpy,
+        ), "T in oneflow.compatible.single_client.typing.Bundle[T] must be one of (oneflow.compatible.single_client.typing.Numpy, oneflow.compatible.single_client.typing.ListNumpy)"
+        assert len(cls.__args__) == 1
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[0])
+    else:
+        _CheckGlobalFunctionReturnAnnotation(cls)
+
+
+def _CheckGlobalFunctionReturnAnnotation(cls):
+    if oft.OriginFrom(cls, typing.Tuple):
+        assert cls.__args__ is not None, "T in typing.Tuple[T, ...] cannot be omitted"
+        assert len(cls.__args__) > 0
+        for cls_arg in cls.__args__:
+            _CheckGlobalFunctionReturnAnnotation(cls_arg)
+    elif oft.OriginFrom(cls, typing.List):
+        assert cls.__args__ is not None, "T in typing.List[T] cannot be omitted"
+        assert len(cls.__args__) == 1
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[0])
+    elif oft.OriginFrom(cls, typing.Dict):
+        assert cls.__args__ is not None, "(K, V) in typing.Dict[K,V] cannot be omitted"
+        assert len(cls.__args__) == 2
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[1])
+    elif oft.OriginFrom(cls, oft.PyStructCompatibleToBlob):
+        pass
+    else:
+        raise NotImplementedError("invalid return annotation %s found" % cls)
+
+
+def CheckReturnByAnnotation(function_name, ret, annotation):
+    if annotation is inspect._empty:
+        return
+    if annotation is None:
+        error_str = (
+            "%s does not matched return annotation %s of global_function %s."
+            % (ret, annotation, function_name)
+        )
+        assert ret is None, error_str
+    elif oft.OriginFrom(annotation, oft.Callback):
+        _CheckReturnByAnnotation(function_name, ret, annotation.__args__[0])
+    elif oft.OriginFrom(annotation, oft.Bundle):
+        if isinstance(ret, oneflow._oneflow_internal.BlobDesc):
+            _CheckReturnByAnnotation(function_name, ret, annotation.__args__[0])
+        elif isinstance(ret, (list, tuple)):
+            for elem in ret:
+                CheckReturnByAnnotation(function_name, elem, annotation)
+        elif type(ret) is dict:
+            for val in ret.values():
+                CheckReturnByAnnotation(function_name, val, annotation)
+        else:
+            raise NotImplementedError("invalid return  %s found" % type(ret))
+    else:
+        _CheckReturnByAnnotation(function_name, ret, annotation)
+
+
+def _CheckReturnByAnnotation(function_name, ret, annotation):
+    error_str = "%s does not matched return annotation %s of global_function %s." % (
+        ret,
+        annotation,
+        function_name,
+    )
+    if oft.OriginFrom(annotation, typing.Tuple):
+        assert type(ret) is tuple, error_str
+        assert len(ret) == len(annotation.__args__), "%s length compare: %s v.s. %s" % (
+            error_str,
+            len(ret),
+            len(annotation.__args__),
+        )
+        for (ret_i, annotation_i) in zip(ret, annotation.__args__):
+            _CheckReturnByAnnotation(function_name, ret_i, annotation_i)
+    elif oft.OriginFrom(annotation, typing.List):
+        assert type(ret) is list, error_str
+        assert len(annotation.__args__) == 1, (
+            "%s element type in list must be unique" % error_str
+        )
+        for ret_i in ret:
+            _CheckReturnByAnnotation(function_name, ret_i, annotation.__args__[0])
+    elif oft.OriginFrom(annotation, typing.Dict):
+        assert len(annotation.__args__) == 2
+        assert type(ret) is dict, error_str
+        for (key, val) in ret.items():
+            assert type(key) is annotation.__args__[0], (
+                "type of %s:%s and %s:%s do not matched return annotation (%s, %s) of global_function %s."
+                % (
+                    key,
+                    type(key),
+                    val,
+                    type(val),
+                    annotation.__args__[0],
+                    annotation.__args__[1],
+                    function_name,
+                )
+            )
+            _CheckReturnByAnnotation(function_name, val, annotation.__args__[1])
+    elif oft.OriginFrom(annotation, oft.Numpy):
+        assert isinstance(
+            ret, oneflow._oneflow_internal.BlobDesc
+        ), "type(ret): %s" % type(ret)
+        assert (
+            not ret.is_dynamic
+        ), "only fixed shaped blob compatible to oneflow.compatible.single_client.typing.Numpy. you can change annotation to oneflow.compatible.single_client.typing.ListNumpy "
+    elif oft.OriginFrom(annotation, oft.ListNumpy):
+        assert isinstance(
+            ret, oneflow._oneflow_internal.BlobDesc
+        ), "type(ret): %s" % type(ret)
+    else:
+        raise NotImplementedError("invalid return annotation %s found" % annotation)
+
+
+def TransformGlobalFunctionResult(future_blob, annotation):
+    if annotation is inspect._empty:
+        return future_blob
+    elif annotation is None:
+        assert future_blob is None
+        return None
+    elif oft.OriginFrom(annotation, oft.Callback):
+        annotation = annotation.__args__[0]
+
+        def Transform(f):
+            return lambda x: f(TransformReturnedLocalBlob(x, annotation))
+
+        return lambda f: future_blob.async_get(Transform(f))
+    elif oft.OriginFrom(annotation, oft.Bundle):
+        return TransformReturnedBundle(future_blob.get(), annotation)
+    else:
+        return TransformReturnedLocalBlob(future_blob.get(), annotation)
+
+
+def TransformReturnedBundle(bundle_blob, annotation):
+    """
+    Transform returned bundle blob from global_function(job_func),
+    the returned bundle blob could be the form like x, [x], (x, ),
+    {"key": x} or the mixed form of them.
+    """
+    if isinstance(bundle_blob, (local_blob_util.LocalBlob,)):
+        return TransformReturnedLocalBlob(bundle_blob, annotation.__args__[0])
+    elif isinstance(bundle_blob, (list, tuple)):
+        return type(bundle_blob)(
+            (TransformReturnedBundle(elem, annotation) for elem in bundle_blob)
+        )
+    elif type(bundle_blob) is dict:
+        return {
+            key: TransformReturnedBundle(val, annotation)
+            for (key, val) in bundle_blob.items()
+        }
+    else:
+        raise NotImplementedError(
+            "invalid return  %s : %s found" % (bundle_blob, type(bundle_blob))
+        )
+
+
+def TransformReturnedLocalBlob(local_blob, annotation):
+    if oft.OriginFrom(annotation, typing.Tuple):
+        assert type(local_blob) is tuple
+        assert len(local_blob) == len(annotation.__args__)
+        pairs = zip(local_blob, annotation.__args__)
+        return tuple((TransformReturnedLocalBlob(*pair) for pair in pairs))
+    elif oft.OriginFrom(annotation, typing.List):
+        assert type(local_blob) is list
+        assert len(annotation.__args__) == 1
+        return [
+            TransformReturnedLocalBlob(elem, annotation.__args__[0])
+            for elem in local_blob
+        ]
+    elif oft.OriginFrom(annotation, typing.Dict):
+        assert type(local_blob) is dict
+        assert len(annotation.__args__) == 2
+        vals = [
+            TransformReturnedLocalBlob(val, annotation.__args__[1])
+            for val in local_blob.values()
+        ]
+        return dict(zip(local_blob.keys(), vals))
+    elif oft.OriginFrom(annotation, oft.PyStructCompatibleToBlob):
+        return TransformLocalBlob(local_blob, annotation)
+    else:
+        raise NotImplementedError(
+            "invalid watch callback parameter annotation %s found" % annotation
+        )
+
+
+def CheckWatchCallbackParameterAnnotation(parameters):
+    assert len(parameters) == 1, "watch callback should accept only one parameter"
+    annotation = parameters[list(parameters.keys())[0]].annotation
+    if annotation is inspect._empty:
+        if enable_typing_check.typing_check_enabled:
+            raise NotImplementedError("the watch callback's parameter is not annotated")
+        return
+    if not oft.OriginFrom(annotation, oft.PyStructCompatibleToBlob):
+        raise NotImplementedError(
+            "invalid watch callback paremeter annotation %s found. " % annotation
+            + "candidate annotations: oneflow.compatible.single_client.typing.Numpy, oneflow.compatible.single_client.typing.ListNumpy. "
+        )
+
+
+def CheckWatchedBlobByAnnotation(blob, annotation):
+    if annotation is inspect._empty:
+        return
+    if oft.OriginFrom(annotation, oft.Numpy):
+        assert (
+            not blob.is_dynamic
+        ), "only fixed shaped blob compatible to oneflow.compatible.single_client.typing.Numpy. you can change annotation to oneflow.compatible.single_client.typing.ListNumpy "
+    elif oft.OriginFrom(annotation, oft.ListNumpy):
+        pass
+    else:
+        raise NotImplementedError(
+            "invalid watch callback parameter annotation %s found" % annotation
+        )
+
+
+def TransformWatchedBlob(future_blob, handler):
+    parameters = inspect.signature(handler).parameters
+    annotation = parameters[list(parameters.keys())[0]].annotation
+    if annotation is inspect._empty:
+        return future_blob
+    return TransformLocalBlob(future_blob, annotation)
+
+
+def TransformLocalBlob(future_blob, annotation):
+    if oft.OriginFrom(annotation, oft.Numpy):
+        return future_blob.numpy()
+    elif oft.OriginFrom(annotation, oft.ListNumpy):
+        return future_blob.numpy_list()
+    else:
+        raise NotImplementedError(
+            "invalid watch callback parameter annotation %s found" % annotation
+        )
diff --git a/python/oneflow/compatible/single_client/framework/unittest.py b/python/oneflow/compatible/single_client/framework/unittest.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8806b4b81058286a841f75f952d5b9ef76dc142
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/unittest.py
@@ -0,0 +1,367 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import atexit
+import imp
+import os
+import socket
+import subprocess
+import sys
+import unittest
+import uuid
+from contextlib import closing
+from tempfile import NamedTemporaryFile
+from typing import Any, Callable, Dict
+
+from google.protobuf import text_format as pbtxt
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import env_util as env_util
+from oneflow.core.job.env_pb2 import EnvProto
+
+
+class _ClearDefaultSession(object):
+    def setUp(self):
+        flow.clear_default_session()
+        flow.enable_eager_execution(False)
+
+
+def register_test_cases(
+    scope: Dict[str, Any],
+    directory: str,
+    filter_by_num_nodes: Callable[[bool], int],
+    base_class: unittest.TestCase = unittest.TestCase,
+    test_case_mixin=_ClearDefaultSession,
+) -> None:
+    def FilterTestPyFile(f):
+        return (
+            os.path.isfile(os.path.join(directory, f))
+            and f.endswith(".py")
+            and f.startswith("test")
+        )
+
+    def FilterMethodName(module, name):
+        method = getattr(module, name)
+        return (
+            name.startswith("test")
+            and callable(method)
+            and filter_by_num_nodes(_GetNumOfNodes(method))
+        )
+
+    onlytest_files = [f for f in os.listdir(directory) if FilterTestPyFile(f)]
+    for f in onlytest_files:
+        class_name = f[0:-3]
+        module = imp.load_source(class_name, os.path.join(directory, f))
+        test_func_names = [
+            name for name in dir(module) if FilterMethodName(module, name)
+        ]
+        method_dict = {k: getattr(module, k) for k in test_func_names}
+        scope[class_name] = type(class_name, (test_case_mixin, base_class), method_dict)
+
+
+def num_nodes_required(num_nodes: int) -> Callable[[Callable], Callable]:
+    def Decorator(f):
+        f.__oneflow_test_case_num_nodes_required__ = num_nodes
+        return f
+
+    return Decorator
+
+
+def _GetNumOfNodes(func):
+    if hasattr(func, "__oneflow_test_case_num_nodes_required__") == False:
+        return 1
+    return getattr(func, "__oneflow_test_case_num_nodes_required__")
+
+
+def eager_execution_enabled():
+    return os.getenv("ONEFLOW_TEST_ENABLE_EAGER") == "1"
+
+
+def typing_check_enabled():
+    return os.getenv("ONEFLOW_TEST_ENABLE_TYPING_CHECK") == "1"
+
+
+def node_list():
+    node_list_str = os.getenv("ONEFLOW_TEST_NODE_LIST")
+    assert node_list_str
+    return node_list_str.split(",")
+
+
+def has_node_list():
+    if os.getenv("ONEFLOW_TEST_NODE_LIST"):
+        return True
+    else:
+        return False
+
+
+def node_size():
+    if has_node_list():
+        node_list_from_env = node_list()
+        return len(node_list_from_env)
+    else:
+        return 1
+
+
+def has_world_size():
+    if os.getenv("ONEFLOW_TEST_WORLD_SIZE"):
+        assert os.getenv(
+            "ONEFLOW_TEST_WORLD_SIZE"
+        ).isdigit(), "env var ONEFLOW_TEST_WORLD_SIZE must be num"
+        return True
+    else:
+        return False
+
+
+def world_size():
+    return int(os.getenv("ONEFLOW_TEST_WORLD_SIZE"))
+
+
+def device_num():
+    device_num_str = os.getenv("ONEFLOW_TEST_DEVICE_NUM")
+    if device_num_str:
+        return int(device_num_str)
+    else:
+        return 1
+
+
+def enable_init_by_host_list():
+    return os.getenv("ONEFLOW_TEST_ENABLE_INIT_BY_HOST_LIST") == "1"
+
+
+def enable_multi_process():
+    return os.getenv("ONEFLOW_TEST_MULTI_PROCESS") == "1"
+
+
+def find_free_port():
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(("localhost", 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
+
+
+_unittest_env_initilized = False
+_unittest_worker_initilized = False
+
+
+def worker_agent_port():
+    port_txt = os.getenv("ONEFLOW_TEST_WORKER_AGENT_PORT")
+    if port_txt:
+        return int(port_txt)
+    else:
+        return None
+
+
+def worker_agent_authkey():
+    key = os.getenv("ONEFLOW_TEST_WORKER_AGENT_AUTHKEY")
+    assert key
+    return key
+
+
+def use_worker_agent():
+    return worker_agent_port() is not None
+
+
+def cast(conn=None, cmd=None, msg=None):
+    cmd = "cast/" + cmd
+    print("[unittest]", f"[{cmd}]", msg)
+    conn.send(cmd.encode())
+    conn.send(msg.encode())
+
+
+def call(conn=None, cmd=None, msg=None):
+    cmd = "call/" + cmd
+    print("[unittest]", f"[{cmd}]", msg)
+    conn.send(cmd.encode())
+    msg_ = ""
+    if msg is not None:
+        msg_ = msg
+    conn.send(msg_.encode())
+    return conn.recv().decode()
+
+
+def launch_worker_via_agent(host=None, env_proto=None):
+    print("[unittest]", "launching worker via agent at", host)
+    from multiprocessing.connection import Client
+
+    address = ("localhost", worker_agent_port())
+    conn = Client(address, authkey=worker_agent_authkey().encode())
+    cast(conn=conn, cmd="host", msg=host)
+    cast(conn=conn, cmd="env_proto", msg=pbtxt.MessageToString(env_proto))
+    assert call(conn=conn, cmd="start_worker") == "ok"
+    print("[unittest]", "worker launched via agent at", host)
+    conn.close()
+
+
+class TestCase(unittest.TestCase):
+    def setUp(self):
+        global _unittest_env_initilized
+        global _unittest_worker_initilized
+        if has_node_list():
+            assert node_size() > 1
+            if _unittest_worker_initilized == False:
+                master_port = os.getenv("ONEFLOW_TEST_MASTER_PORT")
+                assert master_port, "env var ONEFLOW_TEST_MASTER_PORT not set"
+                flow.env.ctrl_port(int(master_port))
+                data_port = os.getenv("ONEFLOW_TEST_DATA_PORT")
+                if data_port:
+                    flow.env.data_port(int(data_port))
+                if enable_init_by_host_list():
+                    flow.env.machine(node_list())
+                    data_port = os.getenv("ONEFLOW_TEST_DATA_PORT")
+                    print("initializing worker...")
+                    for machine in env_util.default_env_proto.machine:
+                        if machine.id == 0:
+                            pass
+                        else:
+                            launch_worker_via_agent(
+                                host=machine.addr, env_proto=env_util.default_env_proto
+                            )
+                else:
+                    ctrl_port = os.getenv("ONEFLOW_TEST_CTRL_PORT")
+                    config_rank_ctrl_port = -1
+                    if ctrl_port:
+                        config_rank_ctrl_port = int(ctrl_port)
+                    if has_world_size():
+                        config_world_size = world_size()
+                    else:
+                        config_world_size = 0
+                    config_node_size = -1
+                    env_node_size = os.getenv("ONEFLOW_TEST_NODE_SIZE")
+                    if env_node_size:
+                        config_node_size = int(env_node_size)
+                    bootstrap_conf_list = flow.env.init_bootstrap_confs(
+                        node_list(),
+                        int(master_port),
+                        config_world_size,
+                        config_rank_ctrl_port,
+                        config_node_size,
+                    )
+                    worker_env_proto = EnvProto()
+                    worker_env_proto.CopyFrom(env_util.default_env_proto)
+                    worker_env_proto.ClearField("ctrl_bootstrap_conf")
+                    for bootstrap_conf in bootstrap_conf_list:
+                        if bootstrap_conf.rank == 0:
+                            continue
+                        assert bootstrap_conf.HasField("host")
+                        worker_env_proto.ctrl_bootstrap_conf.CopyFrom(bootstrap_conf)
+                        launch_worker_via_agent(
+                            host=bootstrap_conf.host, env_proto=worker_env_proto
+                        )
+                _unittest_worker_initilized = True
+        elif device_num() > 1 and enable_multi_process():
+            master_port = find_free_port()
+            flow.env.ctrl_port(master_port)
+            config_world_size = device_num()
+            bootstrap_conf_list = flow.env.init_bootstrap_confs(
+                ["127.0.0.1"], master_port, config_world_size
+            )
+            env_proto = env_util.default_env_proto
+            assert (
+                len(env_proto.machine) == 1
+                and env_proto.HasField("ctrl_bootstrap_conf") == 1
+            )
+            run_dir = os.getenv("HOME") + "/oneflow_temp/" + str(uuid.uuid1())
+            run_dir = os.path.abspath(os.path.expanduser(run_dir))
+            if not os.path.exists(run_dir):
+                os.makedirs(run_dir)
+            for rank in range(1, config_world_size):
+                worker_env_proto = EnvProto()
+                worker_env_proto.CopyFrom(env_proto)
+                worker_env_proto.ctrl_bootstrap_conf.rank = rank
+                worker_env_proto.cpp_logging_conf.log_dir = (
+                    run_dir + "/log_" + str(rank)
+                )
+                env_file = NamedTemporaryFile(delete=False)
+                if sys.version_info >= (3, 0):
+                    env_file.write(pbtxt.MessageToString(worker_env_proto).encode())
+                else:
+                    env_file.write(pbtxt.MessageToString(worker_env_proto))
+                env_file.close()
+                if not os.path.exists(run_dir + "/log_" + str(rank)):
+                    os.mkdir(run_dir + "/log_" + str(rank))
+                os.system(
+                    "cp "
+                    + env_file.name
+                    + " "
+                    + run_dir
+                    + "/log_"
+                    + str(rank)
+                    + "/env_proto_"
+                    + str(rank)
+                    + ".proto"
+                )
+                oneflow_cmd = (
+                    "python3 -m oneflow.compatible.single_client --start_worker"
+                    + " --env_proto="
+                    + run_dir
+                    + "/log_"
+                    + str(rank)
+                    + "/"
+                    + "env_proto_"
+                    + str(rank)
+                    + ".proto"
+                )
+                subprocess.Popen(
+                    oneflow_cmd,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                    shell=True,
+                )
+                os.remove(env_file.name)
+            atexit.register(
+                flow.deprecated.delete_worker_of_multi_process, run_dir=run_dir
+            )
+        log_dir = os.getenv("ONEFLOW_TEST_LOG_DIR")
+        if log_dir:
+            flow.env.log_dir(log_dir)
+        if _unittest_env_initilized == False:
+            flow.env.init()
+            _unittest_env_initilized = True
+        flow.clear_default_session()
+        flow.enable_eager_execution(eager_execution_enabled())
+        flow.experimental.enable_typing_check(typing_check_enabled())
+
+
+def skip_unless(n, d):
+    if node_size() == n and device_num() == d:
+        return lambda func: func
+    else:
+        return unittest.skip(
+            "only runs when node_size is {} and device_num is {}".format(n, d)
+        )
+
+
+def skip_unless_1n1d():
+    return skip_unless(1, 1)
+
+
+def skip_unless_1n2d():
+    return skip_unless(1, 2)
+
+
+def skip_unless_1n4d():
+    return skip_unless(1, 4)
+
+
+def skip_unless_2n1d():
+    return skip_unless(2, 1)
+
+
+def skip_unless_2n2d():
+    return skip_unless(2, 2)
+
+
+def skip_unless_2n4d():
+    return skip_unless(2, 4)
diff --git a/python/oneflow/compatible/single_client/framework/variable_getter_composite.py b/python/oneflow/compatible/single_client/framework/variable_getter_composite.py
new file mode 100644
index 0000000000000000000000000000000000000000..86e76cfd07d04af13e424a803ca2cdb270b61fe4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/variable_getter_composite.py
@@ -0,0 +1,37 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+
+
+class VariableGetterComposite(object):
+    def __init__(self):
+        self.getter_stack = []
+
+    def __call__(self, var_gen_fn, *args, **kwargs):
+        def make_inner(outter, inner):
+            @functools.wraps(inner)
+            def inner_fn():
+                return outter(inner, *args, **kwargs)
+
+            return inner_fn
+
+        fn = var_gen_fn
+        for getter in self.getter_stack:
+            fn = make_inner(getter, fn)
+        return fn()
+
+    def register(self, fn):
+        self.getter_stack.append(fn)
diff --git a/python/oneflow/compatible/single_client/framework/watcher.py b/python/oneflow/compatible/single_client/framework/watcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..e103a7d1776b12eb9a2b6c4a994fe75387eab892
--- /dev/null
+++ b/python/oneflow/compatible/single_client/framework/watcher.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.framework import local_blob as local_blob_util
+from oneflow.compatible.single_client.framework import ofblob as ofblob
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.framework import typing_util as oft_util
+from oneflow.core.record import record_pb2 as record_util
+
+
+def BindUuidAndHandler(uuid, blob_watched, handler):
+    assert isinstance(blob_watched, oneflow._oneflow_internal.ConsistentBlob)
+    session_ctx.GetDefaultSession().uuid2watch_handler[uuid] = (blob_watched, handler)
+
+
+class _Watcher(oneflow._oneflow_internal.ForeignWatcher):
+    def __init__(self):
+        oneflow._oneflow_internal.ForeignWatcher.__init__(self)
+
+    def Call(self, handler_uuid, of_blob_ptr):
+        try:
+            _WatcherHandler(handler_uuid, of_blob_ptr)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+
+def _WatcherHandler(handler_uuid, of_blob_ptr):
+    uuid2handler = session_ctx.GetDefaultSession().uuid2watch_handler
+    assert handler_uuid in uuid2handler
+    (blob_watched, handler) = uuid2handler[handler_uuid]
+    assert callable(handler)
+    ndarray = ofblob.OfBlob(of_blob_ptr).CopyToNdarray()
+    local_blob = local_blob_util.LocalBlob(ndarray, blob_watched.is_dynamic)
+    handler(oft_util.TransformWatchedBlob(local_blob, handler))
+
+
+_global_watcher = _Watcher()
diff --git a/python/oneflow/compatible/single_client/image.py b/python/oneflow/compatible/single_client/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff08a44ad5687cba3cc74a5f1b255f3a77ea3c8b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/image.py
@@ -0,0 +1,39 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.user_data_ops import CropMirrorNormalize
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    CropMirrorNormalize as crop_mirror_normalize,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_image_random_crop as random_crop,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_image_resize as Resize,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_image_resize as resize,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_image_target_resize as target_resize,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    image_batch_align as batch_align,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import image_decode as decode
+from oneflow.compatible.single_client.ops.user_data_ops import image_flip as flip
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    image_normalize as normalize,
+)
diff --git a/python/oneflow/compatible/single_client/layers.py b/python/oneflow/compatible/single_client/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6a5be7ec77a57ca811ac7abe02cbab84ce3bf04
--- /dev/null
+++ b/python/oneflow/compatible/single_client/layers.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.categorical_ordinal_encode_op import (
+    categorical_ordinal_encoder,
+)
+from oneflow.compatible.single_client.ops.layers import (
+    batch_normalization,
+    batch_normalization_add_relu,
+    batch_normalization_relu,
+    conv1d,
+    conv2d,
+    conv3d,
+    dense,
+    layer_norm,
+    layer_norm_grad,
+    layer_norm_param_grad,
+)
+from oneflow.compatible.single_client.ops.layers import upsample as upsample_2d
+from oneflow.compatible.single_client.ops.prelu import prelu
diff --git a/python/oneflow/compatible/single_client/linalg.py b/python/oneflow/compatible/single_client/linalg.py
new file mode 100644
index 0000000000000000000000000000000000000000..a727bad18482b59eef60c6aa78fb5ca48885fd13
--- /dev/null
+++ b/python/oneflow/compatible/single_client/linalg.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.linalg import matmul as matmul
diff --git a/python/oneflow/compatible/single_client/losses.py b/python/oneflow/compatible/single_client/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffc3e5c1ee3afac98bc4c3a482312ffc3dc710d1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/losses.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.losses.add_loss import (
+    api_add_loss as add_loss,
+)
diff --git a/python/oneflow/compatible/single_client/math.py b/python/oneflow/compatible/single_client/math.py
new file mode 100644
index 0000000000000000000000000000000000000000..68568cad9d37f831bb7268e92468c8a2762bc7d4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/math.py
@@ -0,0 +1,119 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.math_binary_elementwise_ops import (
+    atan2,
+    floordiv,
+    pow,
+    xdivy,
+    xlogy,
+)
+from oneflow.compatible.single_client.ops.math_ops import (
+    add,
+    add_n,
+    argmax,
+    broadcast_to_compatible_with,
+    clip_by_value,
+    divide,
+)
+from oneflow.compatible.single_client.ops.math_ops import (
+    elem_cnt as reduced_shape_elem_cnt,
+)
+from oneflow.compatible.single_client.ops.math_ops import equal
+from oneflow.compatible.single_client.ops.math_ops import floor_mod as mod
+from oneflow.compatible.single_client.ops.math_ops import (
+    fused_scale_tril,
+    fused_scale_tril_softmax_dropout,
+    gelu,
+    gelu_grad,
+    greater,
+    greater_equal,
+    in_top_k,
+    l2_normalize,
+    less,
+    less_equal,
+    logical_and,
+    maximum,
+    minimum,
+    multiply,
+    not_equal,
+    polyval,
+    relu,
+    sigmoid,
+    sigmoid_grad,
+    squared_difference,
+    subtract,
+    top_k,
+    tril,
+    unsorted_batch_segment_sum,
+    unsorted_segment_sum,
+    unsorted_segment_sum_like,
+)
+from oneflow.compatible.single_client.ops.math_unary_elementwise_ops import (
+    abs,
+    acos,
+    acosh,
+    asin,
+    asinh,
+    atan,
+    atanh,
+    ceil,
+    cos,
+    cosh,
+    erf,
+    erfc,
+    exp,
+    expm1,
+    floor,
+    lgamma,
+    log,
+    log1p,
+    log_sigmoid,
+    negative,
+    reciprocal,
+    reciprocal_no_nan,
+    rint,
+    round,
+    rsqrt,
+    sigmoid_v2,
+    sign,
+    sin,
+    sinh,
+    softplus,
+    sqrt,
+    square,
+    tan,
+    tanh,
+    tanh_v2,
+)
+from oneflow.compatible.single_client.ops.reduce_mean import reduce_mean
+from oneflow.compatible.single_client.ops.reduce_ops import (
+    reduce_all,
+    reduce_any,
+    reduce_euclidean_norm,
+    reduce_logsumexp,
+    reduce_max,
+    reduce_min,
+    reduce_prod,
+    reduce_std,
+    reduce_sum,
+    reduce_variance,
+)
+from oneflow.compatible.single_client.ops.two_stage_reduce import (
+    api_two_stage_reduce_max as two_stage_reduce_max,
+)
+from oneflow.compatible.single_client.ops.two_stage_reduce import (
+    api_two_stage_reduce_min as two_stage_reduce_min,
+)
diff --git a/python/oneflow/compatible/single_client/model.py b/python/oneflow/compatible/single_client/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..823c5fed92a27e7dc4e73bae8f1bb1fbe73cc1e9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/model.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.model import (
+    Callback,
+    CheckpointConfig,
+    DataModule,
+)
+from oneflow.compatible.single_client.framework.model import Model as Model
+from oneflow.compatible.single_client.framework.model import (
+    NumpyDataModule,
+    TrainingConfig,
+    ValidationConfig,
+)
diff --git a/python/oneflow/compatible/single_client/nn/__init__.py b/python/oneflow/compatible/single_client/nn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9eacac8c15b0037df0aefd60e7bf36520b39e82
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/__init__.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.sparse import Embedding
+from oneflow.compatible.single_client.nn.parameter import Parameter
+from oneflow.compatible.single_client.ops.domain_ops import (
+    api_fused_self_attention_query_mul_key_and_value as fused_self_attention_query_mul_key_and_value,
+)
+from oneflow.compatible.single_client.ops.loss_ops import ctc_greedy_decoder
+from oneflow.compatible.single_client.ops.math_ops import (
+    fused_scale_tril as fused_scale_tril,
+)
+from oneflow.compatible.single_client.ops.math_ops import (
+    fused_scale_tril_softmax_dropout as fused_scale_tril_softmax_dropout,
+)
+from oneflow.compatible.single_client.ops.math_ops import relu as relu
+from oneflow.compatible.single_client.ops.math_ops import tril as tril
+from oneflow.compatible.single_client.ops.nn_ops import (
+    avg_pool1d,
+    avg_pool2d,
+    avg_pool3d,
+    batch_normalization,
+)
+from oneflow.compatible.single_client.ops.nn_ops import bce_loss as BCELoss
+from oneflow.compatible.single_client.ops.nn_ops import (
+    bce_with_logits_loss as BCEWithLogitsLoss,
+)
+from oneflow.compatible.single_client.ops.nn_ops import bias_add, conv1d, conv2d, conv3d
+from oneflow.compatible.single_client.ops.nn_ops import deconv2d as conv2d_transpose
+from oneflow.compatible.single_client.ops.nn_ops import (
+    deconv2d_torch as torch_conv2d_transpose,
+)
+from oneflow.compatible.single_client.ops.nn_ops import (
+    distributed_sparse_softmax_cross_entropy_with_logits,
+    dropout,
+    elu,
+    fused_bias_add_dropout,
+    fused_bias_add_gelu,
+)
+from oneflow.compatible.single_client.ops.nn_ops import group_normalization as GroupNorm
+from oneflow.compatible.single_client.ops.nn_ops import hard_sigmoid as hardsigmoid
+from oneflow.compatible.single_client.ops.nn_ops import hardswish, hardtanh
+from oneflow.compatible.single_client.ops.nn_ops import (
+    instance_normalization1d as InstanceNorm1d,
+)
+from oneflow.compatible.single_client.ops.nn_ops import (
+    instance_normalization2d as InstanceNorm2d,
+)
+from oneflow.compatible.single_client.ops.nn_ops import (
+    instance_normalization3d as InstanceNorm3d,
+)
+from oneflow.compatible.single_client.ops.nn_ops import kldivloss as KLDivLoss
+from oneflow.compatible.single_client.ops.nn_ops import l1_loss as L1Loss
+from oneflow.compatible.single_client.ops.nn_ops import (
+    layer_norm,
+    leaky_relu,
+    logsoftmax,
+)
+from oneflow.compatible.single_client.ops.nn_ops import (
+    margin_ranking_loss as MarginRankingLoss,
+)
+from oneflow.compatible.single_client.ops.nn_ops import (
+    max_pool1d,
+    max_pool2d,
+    max_pool3d,
+    mish,
+    moments,
+)
+from oneflow.compatible.single_client.ops.nn_ops import mse_loss as MSELoss
+from oneflow.compatible.single_client.ops.nn_ops import pixel_shuffle as PixelShuffle
+from oneflow.compatible.single_client.ops.nn_ops import (
+    pixel_shufflev2 as PixelShufflev2,
+)
+from oneflow.compatible.single_client.ops.nn_ops import (
+    random_mask_like,
+    relu6,
+    sigmoid_cross_entropy_with_logits,
+    softmax,
+    softmax_cross_entropy_with_logits,
+    softmax_grad,
+    sparse_cross_entropy,
+    sparse_softmax_cross_entropy_with_logits,
+    swish,
+)
+from oneflow.compatible.single_client.ops.nn_ops import tf_conv2d as compat_conv2d
+from oneflow.compatible.single_client.ops.nn_ops import (
+    triplet_margin_loss as TripletMarginLoss,
+)
diff --git a/python/oneflow/compatible/single_client/nn/common_types.py b/python/oneflow/compatible/single_client/nn/common_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..d91ca3eda286f89df41573a5ffc4a0007afdbf8d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/common_types.py
@@ -0,0 +1,35 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Tuple, TypeVar, Union
+
+T = TypeVar("T")
+_scalar_or_tuple_any_t = Union[T, Tuple[T, ...]]
+_scalar_or_tuple_1_t = Union[T, Tuple[T]]
+_scalar_or_tuple_2_t = Union[T, Tuple[T, T]]
+_scalar_or_tuple_3_t = Union[T, Tuple[T, T, T]]
+_scalar_or_tuple_4_t = Union[T, Tuple[T, T, T, T]]
+_scalar_or_tuple_5_t = Union[T, Tuple[T, T, T, T, T]]
+_scalar_or_tuple_6_t = Union[T, Tuple[T, T, T, T, T, T]]
+_size_any_t = _scalar_or_tuple_any_t[int]
+_size_1_t = _scalar_or_tuple_1_t[int]
+_size_2_t = _scalar_or_tuple_2_t[int]
+_size_3_t = _scalar_or_tuple_3_t[int]
+_size_4_t = _scalar_or_tuple_4_t[int]
+_size_5_t = _scalar_or_tuple_5_t[int]
+_size_6_t = _scalar_or_tuple_6_t[int]
+_ratio_2_t = _scalar_or_tuple_2_t[float]
+_ratio_3_t = _scalar_or_tuple_3_t[float]
+_ratio_any_t = _scalar_or_tuple_any_t[float]
diff --git a/python/oneflow/compatible/single_client/nn/init.py b/python/oneflow/compatible/single_client/nn/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..39ff77a373d1ad8bd099c1e4cf973e51b7a7ae24
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/init.py
@@ -0,0 +1,77 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.initializer_util import CalcGain
+
+
+def calculate_gain(nonlinearity, param=None):
+    return CalcGain(nonlinearity, param)
+
+
+def uniform_(tensor, a=0.0, b=1.0):
+    tensor.uniform_(a, b)
+
+
+def normal_(tensor, mean=0.0, std=1.0):
+    tensor.normal_(mean, std)
+
+
+def xavier_uniform_(tensor, gain=1.0, *, data_format="NCHW"):
+    tensor.xavier_uniform_(gain, data_format=data_format)
+
+
+def xavier_normal_(tensor, gain=1.0, *, data_format="NCHW"):
+    tensor.xavier_normal_(gain, data_format=data_format)
+
+
+def kaiming_uniform_(
+    tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+):
+    tensor.kaiming_uniform_(a, mode, nonlinearity, data_format=data_format)
+
+
+def kaiming_normal_(
+    tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+):
+    tensor.kaiming_normal_(a, mode, nonlinearity, data_format=data_format)
+
+
+def constant_(tensor, val):
+    tensor.fill_(val)
+
+
+def ones_(tensor):
+    tensor.fill_(1)
+
+
+def zeros_(tensor):
+    tensor.fill_(0)
+
+
+def _calculate_fan_in_and_fan_out(tensor):
+    dimensions = tensor.ndimension()
+    if dimensions < 2:
+        raise ValueError(
+            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+        )
+    num_input_fmaps = tensor.size(1)
+    num_output_fmaps = tensor.size(0)
+    receptive_field_size = 1
+    if tensor.ndimension() > 2:
+        for s in tensor.size()[2:]:
+            receptive_field_size *= s
+    fan_in = num_input_fmaps * receptive_field_size
+    fan_out = num_output_fmaps * receptive_field_size
+    return (fan_in, fan_out)
diff --git a/python/oneflow/compatible/single_client/nn/module.py b/python/oneflow/compatible/single_client/nn/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9bcdcdcc010293d7767b3e029225ec36898c7f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/module.py
@@ -0,0 +1,489 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import itertools
+from collections import OrderedDict, namedtuple
+from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, TypeVar, Union
+
+import numpy as np
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.check_point_v2 import (
+    FeedValueToVariable,
+)
+from oneflow.compatible.single_client.framework.function_util import (
+    global_function_or_identity,
+)
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn.parameter import Parameter
+
+
+class _IncompatibleKeys(
+    namedtuple("IncompatibleKeys", ["missing_keys", "unexpected_keys"])
+):
+    def __repr__(self):
+        if not self.missing_keys and (not self.unexpected_keys):
+            return "<All keys matched successfully>"
+        return super(_IncompatibleKeys, self).__repr__()
+
+    __str__ = __repr__
+
+
+T = TypeVar("T", bound="Module")
+
+
+class Module(object):
+    def __init__(self):
+        self.training = True
+        self._consistent = False
+        self._parameters = OrderedDict()
+        self._buffers = OrderedDict()
+        self._non_persistent_buffers_set = set()
+        self._backward_hooks = OrderedDict()
+        self._is_full_backward_hook = None
+        self._forward_hooks = OrderedDict()
+        self._forward_pre_hooks = OrderedDict()
+        self._state_dict_hooks = OrderedDict()
+        self._load_state_dict_pre_hooks = OrderedDict()
+        self._modules = OrderedDict()
+
+    @property
+    def consistent(self):
+        return self._consistent
+
+    def forward(self, *args):
+        raise NotImplementedError()
+
+    def consistent_forward(self, *args):
+        return self.forward(*args)
+
+    def force_mirrored_forward(self, *args):
+        raise NotImplementedError()
+
+    def __call__(self, *args):
+        for hook in itertools.chain(self._forward_pre_hooks.values()):
+            result = hook(self, args)
+            if result is not None:
+                if not isinstance(result, tuple):
+                    result = (result,)
+                args = result
+        res = self.forward(*args)
+        return res
+
+    def add_module(self, name: str, module: Optional["Module"]) -> None:
+        """Adds a child module to the current module.
+
+        The module can be accessed as an attribute using the given name.
+
+        Args:
+            name (string): name of the child module. The child module can be
+                accessed from this module using the given name
+            module (Module): child module to be added to the module.
+        """
+        if not isinstance(module, Module) and module is not None:
+            raise TypeError("{} is not a Module subclass".format(type(module)))
+        elif not isinstance(name, str):
+            raise TypeError("module name should be a string. Got {}".format(type(name)))
+        elif hasattr(self, name) and name not in self._modules:
+            raise KeyError("attribute '{}' already exists".format(name))
+        elif "." in name:
+            raise KeyError('module name can\'t contain ".", got: {}'.format(name))
+        elif name == "":
+            raise KeyError('module name can\'t be empty string ""')
+        self._modules[name] = module
+
+    def register_buffer(
+        self, name: str, tensor: Optional[Tensor], persistent: bool = True
+    ) -> None:
+        if "_buffers" not in self.__dict__:
+            raise AttributeError("cannot assign buffer before Module.__init__() call")
+        elif not isinstance(name, str):
+            raise TypeError("buffer name should be a string. Got {}".format(type(name)))
+        elif "." in name:
+            raise KeyError('buffer name can\'t contain "."')
+        elif name == "":
+            raise KeyError('buffer name can\'t be empty string ""')
+        elif hasattr(self, name) and name not in self._buffers:
+            raise KeyError("attribute '{}' already exists".format(name))
+        elif tensor is not None and (not isinstance(tensor, Tensor)):
+            raise TypeError(
+                "cannot assign '{}' object to buffer '{}' (Tensor or None required)".format(
+                    type(tensor), name
+                )
+            )
+        else:
+            self._buffers[name] = tensor
+            if persistent:
+                self._non_persistent_buffers_set.discard(name)
+            else:
+                self._non_persistent_buffers_set.add(name)
+
+    def register_parameter(self, name: str, param: Optional[Parameter]) -> None:
+        if "_parameters" not in self.__dict__:
+            raise AttributeError(
+                "cannot assign parameter before Module.__init__() call"
+            )
+        elif not isinstance(name, str):
+            raise TypeError(
+                "parameter name should be a string. Got {}".format(type(name))
+            )
+        elif "." in name:
+            raise KeyError('parameter name can\'t contain "."')
+        elif name == "":
+            raise KeyError('parameter name can\'t be empty string ""')
+        elif hasattr(self, name) and name not in self._parameters:
+            raise KeyError("attribute '{}' already exists".format(name))
+        if param is None:
+            self._parameters[name] = None
+        elif not isinstance(param, Parameter):
+            raise TypeError(
+                "cannot assign '{}' object to parameter '{}' (nn.Parameter or None required)".format(
+                    type(param), name
+                )
+            )
+        else:
+            self._parameters[name] = param
+
+    def __getattr__(self, name: str) -> Union[Tensor, "Module"]:
+        if "_parameters" in self.__dict__:
+            _parameters = self.__dict__["_parameters"]
+            if name in _parameters:
+                return _parameters[name]
+        if "_buffers" in self.__dict__:
+            _buffers = self.__dict__["_buffers"]
+            if name in _buffers:
+                return _buffers[name]
+        if "_modules" in self.__dict__:
+            modules = self.__dict__["_modules"]
+            if name in modules:
+                return modules[name]
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, name)
+        )
+
+    def __setattr__(self, name: str, value: Union[Tensor, "Module"]) -> None:
+        def remove_from(*dicts_or_sets):
+            for d in dicts_or_sets:
+                if name in d:
+                    if isinstance(d, dict):
+                        del d[name]
+                    else:
+                        d.discard(name)
+
+        params = self.__dict__.get("_parameters")
+        if isinstance(value, Parameter):
+            if params is None:
+                raise AttributeError(
+                    "cannot assign parameters before Module.__init__() call"
+                )
+            remove_from(
+                self.__dict__,
+                self._buffers,
+                self._modules,
+                self._non_persistent_buffers_set,
+            )
+            self.register_parameter(name, value)
+        elif params is not None and name in params:
+            if value is not None:
+                raise TypeError(
+                    "cannot assign '{}' as parameter '{}' (nn.Parameter or None expected)".format(
+                        type(value), name
+                    )
+                )
+            self.register_parameter(name, value)
+        else:
+            modules = self.__dict__.get("_modules")
+            if isinstance(value, Module):
+                if modules is None:
+                    raise AttributeError(
+                        "cannot assign module before Module.__init__() call"
+                    )
+                remove_from(
+                    self.__dict__,
+                    self._parameters,
+                    self._buffers,
+                    self._non_persistent_buffers_set,
+                )
+                modules[name] = value
+            elif modules is not None and name in modules:
+                if value is not None:
+                    raise TypeError(
+                        "cannot assign '{}' as child module '{}' (nn.Module or None expected)".format(
+                            type(value), name
+                        )
+                    )
+                modules[name] = value
+            else:
+                buffers = self.__dict__.get("_buffers")
+                if buffers is not None and name in buffers:
+                    if value is not None and (not isinstance(value, Tensor)):
+                        raise TypeError(
+                            "cannot assign '{}' as buffer '{}' (Tensor or None expected)".format(
+                                type(value), name
+                            )
+                        )
+                    buffers[name] = value
+                else:
+                    object.__setattr__(self, name, value)
+
+    def _named_members(self, get_members_fn, prefix="", recurse=True):
+        memo = set()
+        modules = self.named_modules(prefix=prefix) if recurse else [(prefix, self)]
+        for (module_prefix, module) in modules:
+            members = get_members_fn(module)
+            for (k, v) in members:
+                if v is None or v in memo:
+                    continue
+                memo.add(v)
+                name = module_prefix + ("." if module_prefix else "") + k
+                yield (name, v)
+
+    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
+        for (name, param) in self.named_parameters(recurse=recurse):
+            yield param
+
+    def named_parameters(
+        self, prefix: str = "", recurse: bool = True
+    ) -> Iterator[Tuple[str, Tensor]]:
+        gen = self._named_members(
+            lambda module: module._parameters.items(), prefix=prefix, recurse=recurse
+        )
+        for elem in gen:
+            yield elem
+
+    def buffers(self, recurse: bool = True) -> Iterator[Tensor]:
+        for (name, buf) in self.named_buffers(recurse=recurse):
+            yield buf
+
+    def named_buffers(
+        self, prefix: str = "", recurse: bool = True
+    ) -> Iterator[Tuple[str, Tensor]]:
+        gen = self._named_members(
+            lambda module: module._buffers.items(), prefix=prefix, recurse=recurse
+        )
+        for elem in gen:
+            yield elem
+
+    def children(self) -> Iterator["Module"]:
+        for (name, module) in self.named_children():
+            yield module
+
+    def named_children(self) -> Iterator[Tuple[str, "Module"]]:
+        memo = set()
+        for (name, module) in self._modules.items():
+            if module is not None and module not in memo:
+                memo.add(module)
+                yield (name, module)
+
+    def modules(self) -> Iterator["Module"]:
+        for (name, module) in self.named_modules():
+            yield module
+
+    def named_modules(self, memo: Optional[Set["Module"]] = None, prefix: str = ""):
+        if memo is None:
+            memo = set()
+        if self not in memo:
+            memo.add(self)
+            yield (prefix, self)
+            for (name, module) in self._modules.items():
+                if module is None:
+                    continue
+                submodule_prefix = prefix + ("." if prefix else "") + name
+                for m in module.named_modules(memo, submodule_prefix):
+                    yield m
+
+    def train(self: T, mode: bool = True) -> T:
+        self.training = mode
+        for module in self.children():
+            module.train(mode)
+        return self
+
+    def eval(self: T) -> T:
+        return self.train(False)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        for (name, param) in self._parameters.items():
+            if param is not None:
+                destination[prefix + name] = param
+        for (name, buf) in self._buffers.items():
+            if buf is not None and name not in self._non_persistent_buffers_set:
+                destination[prefix + name] = buf
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        for hook in self._load_state_dict_pre_hooks.values():
+            hook(
+                state_dict,
+                prefix,
+                local_metadata,
+                strict,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+        persistent_buffers = {
+            k: v
+            for (k, v) in self._buffers.items()
+            if k not in self._non_persistent_buffers_set
+        }
+        local_name_params = itertools.chain(
+            self._parameters.items(), persistent_buffers.items()
+        )
+        local_state = {k: v for (k, v) in local_name_params if v is not None}
+        for (name, param) in local_state.items():
+            key = prefix + name
+            if key in state_dict:
+                input_param = state_dict[key]
+                if tuple(input_param.shape) != tuple(param.shape):
+                    error_msgs.append(
+                        "size mismatch for {}: copying a param with shape {} from checkpoint, the shape in current model is {}.".format(
+                            key, input_param.shape, param.shape
+                        )
+                    )
+                    continue
+                try:
+                    param.copy_(input_param)
+                except Exception as ex:
+                    error_msgs.append(
+                        'While copying the parameter named "{}", whose dimensions in the model are {} and whose dimensions in the checkpoint are {}, an exception occurred : {}.'.format(
+                            key, param.shape, input_param.shape, ex.args
+                        )
+                    )
+            elif strict:
+                missing_keys.append(key)
+        if strict:
+            for key in state_dict.keys():
+                if key.startswith(prefix):
+                    input_name = key[len(prefix) :]
+                    input_name = input_name.split(".", 1)[0]
+                    if (
+                        input_name not in self._modules
+                        and input_name not in local_state
+                    ):
+                        unexpected_keys.append(key)
+
+    def load_state_dict(
+        self,
+        state_dict: Union[Dict[str, Tensor], Dict[str, Tensor]],
+        strict: bool = True,
+    ):
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        metadata = getattr(state_dict, "_metadata", None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=""):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict,
+                prefix,
+                local_metadata,
+                True,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+            for (name, child) in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + ".")
+
+        load(self)
+        load = None
+        if strict:
+            if len(unexpected_keys) > 0:
+                error_msgs.insert(
+                    0,
+                    "Unexpected key(s) in state_dict: {}. ".format(
+                        ", ".join(('"{}"'.format(k) for k in unexpected_keys))
+                    ),
+                )
+            if len(missing_keys) > 0:
+                error_msgs.insert(
+                    0,
+                    "Missing key(s) in state_dict: {}. ".format(
+                        ", ".join(('"{}"'.format(k) for k in missing_keys))
+                    ),
+                )
+        if len(error_msgs) > 0:
+            raise RuntimeError(
+                "Error(s) in loading state_dict for {}:\n\t{}".format(
+                    self.__class__.__name__, "\n\t".join(error_msgs)
+                )
+            )
+        return _IncompatibleKeys(missing_keys, unexpected_keys)
+
+    def state_dict(
+        self, destination=None, prefix="", keep_vars=False
+    ) -> Dict[str, Tensor]:
+        if destination is None:
+            destination = OrderedDict()
+            destination._metadata = OrderedDict()
+        self._save_to_state_dict(destination, prefix, keep_vars)
+        for (name, module) in self._modules.items():
+            if module is not None:
+                module.state_dict(destination, prefix + name + ".", keep_vars=keep_vars)
+        for hook in self._state_dict_hooks.values():
+            hook_result = hook(self, destination, prefix)
+            if hook_result is not None:
+                destination = hook_result
+        return destination
+
+    def register_forward_pre_hook(self, hook: Callable[..., None]) -> None:
+        self._forward_pre_hooks[len(self._forward_pre_hooks)] = hook
+
+    def _apply(self, fn):
+        for module in self.children():
+            module._apply(fn)
+        for (key, param) in self._parameters.items():
+            if param is not None:
+                assert isinstance(param, Parameter)
+                assert param.is_leaf
+                with flow.no_grad():
+                    param_applied = Tensor(fn(param))
+                self._parameters[key] = Parameter(param_applied, param.requires_grad)
+                if param.grad is not None:
+                    assert param.grad.is_leaf
+                    with flow.no_grad():
+                        grad_applied = Tensor(fn(param.grad))
+                    self._parameters[key].grad = grad_applied.requires_grad_(
+                        param.grad.requires_grad
+                    )
+        for (key, buf) in self._buffers.items():
+            if buf is not None:
+                self._buffers[key] = Tensor(fn(buf))
+        return self
+
+    def apply(self: T, fn: Callable[["Module"], None]) -> T:
+        for module in self.children():
+            module.apply(fn)
+        fn(self)
+        return self
+
+    def to(self, device: Optional[Union[str, flow.device]] = None):
+        def convert(t):
+            return t.to(device)
+
+        return self._apply(convert)
diff --git a/python/oneflow/compatible/single_client/nn/modules/__init__.py b/python/oneflow/compatible/single_client/nn/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/nn/modules/abs.py b/python/oneflow/compatible/single_client/nn/modules/abs.py
new file mode 100644
index 0000000000000000000000000000000000000000..51cdb86b2fbc3379123755ab6d2516fdfe863026
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/abs.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Abs(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.abs(x)
+
+
+@register_tensor_op("abs")
+def abs_op(x):
+    """Return the absolute value of each element in input tensor:math:`y = |x|` element-wise.
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([-1, 2, -3, 4]).astype(np.float32))
+        >>> flow.abs(x)
+        tensor([1., 2., 3., 4.], dtype=oneflow.float32)
+
+    """
+    return Abs()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/acos.py b/python/oneflow/compatible/single_client/nn/modules/acos.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff61be079641d0d849109aa1ba4154b1aad03eff
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/acos.py
@@ -0,0 +1,61 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Acos(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.acos(x)
+
+
+@register_tensor_op("acos")
+def acos_op(tensor):
+    """
+    Returns a new tensor with the inverse cosine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\arccos(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> arr = np.array([0.5, 0.6, 0.7])
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.acos(input)
+        >>> print(output.numpy())
+        [1.0471976  0.9272952  0.79539883]
+
+    """
+    return Acos()(tensor)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/acosh.py b/python/oneflow/compatible/single_client/nn/modules/acosh.py
new file mode 100644
index 0000000000000000000000000000000000000000..9437a327800a3c9171d6c72ec0f432d51258f185
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/acosh.py
@@ -0,0 +1,95 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Acosh(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.acosh(x)
+
+
+def acosh_op(x):
+    """Returns a new tensor with the inverse hyperbolic cosine of the elements of :attr:`input`.
+
+    .. math::
+
+        \\text{out}_{i} = \\cosh^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> x1 = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        >>> out1 = flow.acosh(x1)
+        >>> out1
+        tensor([1.317 , 1.7627, 2.0634], dtype=oneflow.float32)
+        >>> x2 = flow.Tensor(np.array([1.5, 2.6, 3.7]).astype(np.float32),device=flow.device('cuda'))
+        >>> out2 = flow.acosh(x2)
+        >>> out2
+        tensor([0.9624, 1.6094, 1.9827], device='cuda:0', dtype=oneflow.float32)
+
+    """
+    return Acosh()(x)
+
+
+@register_tensor_op("acosh")
+def acosh_op_tensor(x):
+    """
+
+    acosh() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.acosh`
+
+    """
+    return Acosh()(x)
+
+
+def arccosh_op(x):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.acosh`
+
+    """
+    return Acosh()(x)
+
+
+@register_tensor_op("arccosh")
+def arccosh_op_tensor(x):
+    """
+
+    arccosh() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.acosh`
+
+    """
+    return Acosh()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/activation.py b/python/oneflow/compatible/single_client/nn/modules/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad13843beb4a745f707059f2702252b2ace4b7ab
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/activation.py
@@ -0,0 +1,921 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+def _softmax_need_transpose(x, axis):
+    assert type(axis) is int
+    dim_num = len(x.shape)
+    if dim_num == 1:
+        return (False, None)
+    if axis < 0:
+        axis += dim_num
+    assert axis >= 0
+    assert axis < dim_num
+    need_transpose = False
+    permute = list(range(dim_num))
+    if axis != dim_num - 1:
+        need_transpose = True
+        permute[axis] = permute[-1]
+        permute[-1] = axis
+    return (need_transpose, permute)
+
+
+class PReLU(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        PReLU(x) = \\max(0,x) + a * \\min(0,x)
+
+    Here :math:`a` is a learnable parameter. When called without arguments, `nn.PReLU()` uses a single
+    parameter :math:`a` across all input channels. If called with `nn.PReLU(nChannels)`,
+    a separate :math:`a` is used for each input channel.
+
+
+    .. note::
+        weight decay should not be used when learning :math:`a` for good performance.
+
+    .. note::
+        Channel dim is the 2nd dim of input. When input has dims < 2, then there is
+        no channel dim and the number of channels = 1.
+
+    Args:
+        num_parameters (int): number of :math:`a` to learn.
+            Although it takes an int as input, there is only two values are legitimate:
+            1, or the number of channels at input. Default: 1
+        init (float): the initial value of :math:`a`. Default: 0.25
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    Attr:
+        - weight (Tensor): the learnable weights of shape (:attr:`num_parameters`).
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> m = flow.nn.PReLU()
+        >>> input = flow.Tensor(np.asarray([[[[1, -2], [3, 4]]]]), dtype=flow.float32)
+        >>> print(m(input).numpy())
+        [[[[ 1.  -0.5]
+           [ 3.   4. ]]]]
+
+    """
+
+    def __init__(self, num_parameters: int = 1, init: float = 0.25) -> None:
+        super().__init__()
+        self.num_parameters = num_parameters
+        self.weight = flow.nn.Parameter(flow.Tensor(num_parameters, 1, 1).fill_(init))
+        self.op = flow.builtin_op("prelu").Input("x").Input("alpha").Output("y").Build()
+
+    def forward(self, x):
+        assert (
+            self.num_parameters == 1 or self.num_parameters == x.shape[1]
+        ), f"num_parameters in prelu must be 1 or {x.shape[1]}"
+        return self.op(x, self.weight)[0]
+
+
+class ReLU(Module):
+    """Applies the rectified linear unit function element-wise:
+
+    :math:`\\text{ReLU}(x) = (x)^+ = \\max(0, x)`
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> relu = flow.nn.ReLU()
+        >>> ndarr = np.asarray([1, -2, 3])
+        >>> x = flow.Tensor(ndarr)
+        >>> relu(x)
+        tensor([1., 0., 3.], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.relu(x)
+
+
+class ReLU6(Module):
+    """Applies the element-wise function:
+
+    .. math::
+
+        \\text{Relu6}(x) = \\begin{cases}
+            6 & \\text{ if } x > 6 \\\\
+            0 & \\text{ if } x < 0 \\\\
+            x & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> relu6 = flow.nn.ReLU6()
+
+        >>> out = relu6(input)
+        >>> out
+        tensor([0. , 0. , 0.5], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.hardtanh(x, min_val=0.0, max_val=6.0)
+
+
+class Tanh(Module):
+    """This operator computes the hyperbolic tangent value of Tensor.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{e^x-e^{-x}}{e^x+e^{-x}}
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-1, 0, 1]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> tanh = flow.nn.Tanh()
+        >>> out = tanh(input)
+        >>> out
+        tensor([-0.7616,  0.    ,  0.7616], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.tanh(x)
+
+
+@register_tensor_op("tanh")
+def tanh_op(x):
+    """This operator computes the hyperbolic tangent value of Tensor.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{e^x-e^{-x}}{e^x+e^{-x}}
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-1, 0, 1]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> tanh = flow.nn.Tanh()
+        >>> out = tanh(input)
+        >>> out
+        tensor([-0.7616,  0.    ,  0.7616], dtype=oneflow.float32)
+
+    """
+    return Tanh()(x)
+
+
+class ELU(Module):
+    """Applies the element-wise function:
+
+    .. math::
+
+        \\text{ELU}(x) = \\begin{cases}
+				x & \\text{ if } x \\gt 0  \\\\
+                \\alpha*(exp(x)-1) & \\text{ if } x \\le 0 \\\\
+    		    \\end{cases}
+
+    Args:
+        alpha: the :math:`\\alpha` value for the ELU formulation. Default: 1.0
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> elu = flow.nn.ELU()
+
+        >>> out = elu(input)
+        >>> out
+        tensor([-0.3935,  0.    ,  0.5   ], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, alpha: float = 1.0, inplace: bool = False):
+        super().__init__()
+        self.alpha = alpha
+
+    def forward(self, x):
+        return flow.F.elu(x, alpha=self.alpha)
+
+
+class GELU(Module):
+    """Gelu activation operator.
+
+    The equation is:
+
+    .. math::
+        out = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): Input Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> gelu = flow.nn.GELU()
+
+        >>> out = gelu(input)
+        >>> out
+        tensor([-0.1543,  0.    ,  0.3457], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.gelu(x)
+
+
+@register_tensor_op("gelu")
+def gelu_op(x):
+    """Gelu activation operator.
+
+    The equation is:
+
+    .. math::
+        out = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): Input Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> gelu = flow.nn.GELU()
+
+        >>> out = gelu(input)
+        >>> out
+        tensor([-0.1543,  0.    ,  0.3457], dtype=oneflow.float32)
+
+    """
+    return GELU()(x)
+
+
+class Sigmoid(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Sigmoid}(x) = \\sigma(x) = \\frac{1}{1 + \\exp(-x)}
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([0.81733328, 0.43621480, 0.10351428]))
+        >>> m = flow.nn.Sigmoid()
+        >>> out = m(x)
+        >>> out
+        tensor([0.6937, 0.6074, 0.5259], dtype=oneflow.float32)
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.sigmoid(x)
+
+
+@register_tensor_op("sigmoid")
+def sigmoid_op(x):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Sigmoid}(x) = \\sigma(x) = \\frac{1}{1 + \\exp(-x)}
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([0.81733328, 0.43621480, 0.10351428]))
+        >>> out = flow.sigmoid(x)
+        >>> out
+        tensor([0.6937, 0.6074, 0.5259], dtype=oneflow.float32)
+
+    """
+    return Sigmoid()(x)
+
+
+class Hardsigmoid(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Hardsigmoid}(x) = \\begin{cases}
+            0 & \\text{ if } x \\le -3  \\\\
+            1 & \\text{ if } x \\ge +3 \\\\
+            \\frac{x}{6} + \\frac{1}{2} & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> hardsigmoid = flow.nn.Hardsigmoid()
+
+        >>> out = hardsigmoid(input)
+        >>> out
+        tensor([0.4167, 0.5   , 0.5833], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.hardsigmoid(x)
+
+
+class Softmax(Module):
+    def __init__(self, dim: Optional[int] = None):
+        super().__init__()
+        self.axis = -1 if dim is None else dim
+
+    def forward(self, x):
+        (need_transpose, permute) = _softmax_need_transpose(x, self.axis)
+        if need_transpose:
+            x = flow.F.transpose(x, perm=permute)
+        res = flow.F.softmax(x)
+        if need_transpose:
+            res = flow.F.transpose(res, perm=permute)
+        return res
+
+
+@register_tensor_op("softmax")
+def softmax_op(tensor, dim=None):
+    """Applies the Softmax function to an n-dimensional input Tensor
+    rescaling them so that the elements of the n-dimensional output Tensor
+    lie in the range [0,1] and sum to 1.
+
+    Softmax is defined as:
+
+    .. math::
+        \\text{Softmax}(x_{i}) = \\frac{\\exp(x_i)}{\\sum_j \\exp(x_j)}
+
+    When the input Tensor is a sparse tensor then the unspecifed
+    values are treated as ``-inf``.
+
+    Shape:
+        - Input: :math:`(*)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
+
+    Returns:
+        a Tensor of the same dimension and shape as the input with
+        values in the range [0, 1]
+
+    Args:
+        dim (int): A dimension along which Softmax will be computed (so every slice
+            along dim will sum to 1).
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> m = flow.nn.Softmax(dim = 2)
+        >>> x = flow.Tensor(
+        ...    np.array(
+        ...        [[[-0.46716809,  0.40112534,  0.61984003],
+        ...        [-1.31244969, -0.42528763,  1.47953856]]]
+        ...    )
+        ... )
+        >>> out = m(x)
+        >>> out
+        tensor([[[0.1575, 0.3754, 0.4671],
+                 [0.0507, 0.123 , 0.8263]]], dtype=oneflow.float32)
+    """
+    return Softmax(dim)(tensor)
+
+
+class LogSoftmax(Module):
+    """Applies the :math:`\\log(\\text{Softmax}(x))` function to an n-dimensional
+    input Tensor.
+    The LogSoftmax formulation can be simplified as:
+
+    .. math::
+        \\text{LogSoftmax}(x_{i}) = \\log\\left(\\frac{\\exp(x_i) }{ \\sum_j \\exp(x_j)} \\right)
+
+    Args:
+        dim (int): A dimension along which LogSoftmax will be computed.
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> m = flow.nn.LogSoftmax(dim=1)
+        >>> x = flow.Tensor(
+        ...    np.array(
+        ...        [[ 0.4296, -1.1957,  2.5463],
+        ...        [ 1.2552, -1.5747,  0.6923]]
+        ...    )
+        ... )
+        >>> out = m(x)
+        >>> out
+        tensor([[-2.2513, -3.8766, -0.1346],
+                [-0.4877, -3.3176, -1.0506]], dtype=oneflow.float32)
+    """
+
+    def __init__(self, dim: Optional[int] = 1):
+        super().__init__()
+        self.dim = dim
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        if not hasattr(self, "dim"):
+            self.dim = None
+
+    def forward(self, x):
+        (need_transpose, permute) = _softmax_need_transpose(x, self.dim)
+        if need_transpose:
+            x = flow.F.transpose(x, perm=permute)
+        x = x.softmax()
+        res = x.log()
+        if need_transpose:
+            res = flow.F.transpose(res, perm=permute)
+        return res
+
+    def extra_repr(self):
+        return "dim={dim}".format(dim=self.dim)
+
+
+class LogSigmoid(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{LogSigmoid}(x) = \\log\\left(\\frac{ 1 }{ 1 + \\exp(-x)}\\right)
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> logsigmoid = flow.nn.LogSigmoid()
+
+        >>> out = logsigmoid(input)
+        >>> out
+        tensor([-0.9741, -0.6931, -0.4741], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        sigmoid_res = flow.experimental.sigmoid(x)
+        res = flow.experimental.log(sigmoid_res)
+        return res
+
+
+class Softplus(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Softplus}(x) = \\frac{1}{\\beta} * \\log(1 + \\exp(\\beta * x))
+
+    SoftPlus is a smooth approximation to the ReLU function and can be used
+    to constrain the output of a machine to always be positive.
+
+    For numerical stability the implementation reverts to the linear function
+    when :math:`input \\times \\beta > threshold`.
+
+    Args:
+        beta: the :math:`\\beta` value for the Softplus formulation. Default: 1
+        threshold: values above this revert to a linear function. Default: 20
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> softplus = flow.nn.Softplus()
+
+        >>> out = softplus(input)
+        >>> out
+        tensor([0.4741, 0.6931, 0.9741], dtype=oneflow.float32)
+    """
+
+    def __init__(self, beta: int = 1, threshold: int = 20):
+        super().__init__()
+        self.beta = beta
+        self.threshold = threshold
+
+    def forward(self, x):
+        return flow.experimental.where(
+            x * self.beta > self.threshold,
+            x,
+            1
+            / self.beta
+            * flow.experimental.log(1.0 + flow.experimental.exp(self.beta * x)),
+        )
+
+
+class Hardswish(Module):
+    """Applies the hardswish function, element-wise, as described in the paper:
+    `Searching for MobileNetV3`_.
+
+    .. math::
+        \\text{Hardswish}(x) = \\begin{cases}
+            0 & \\text{ if } x \\le -3  \\\\
+            x & \\text{ if } x \\ge +3 \\\\
+            x*(x+3)/6 & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> hardswish = flow.nn.Hardswish()
+
+        >>> out = hardswish(input)
+        >>> out
+        tensor([-0.2083,  0.    ,  0.2917], dtype=oneflow.float32)
+
+    .. _`Searching for MobileNetV3`:
+        https://arxiv.org/abs/1905.02244
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.hardswish(x)
+
+
+class Hardtanh(Module):
+    """
+    Applies the HardTanh function element-wise
+
+    HardTanh is defined as:
+
+    .. math::
+        \\text{HardTanh}(x) = \\begin{cases}
+            1 & \\text{ if } x > 1 \\\\
+            -1 & \\text{ if } x < -1 \\\\
+            x & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    The range of the linear region :math:`[-1, 1]` can be adjusted using
+    :attr:`min_val` and :attr:`max_val`.
+
+    Args:
+        min_val: minimum value of the linear region range. Default: -1
+        max_val: maximum value of the linear region range. Default: 1
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Keyword arguments :attr:`min_value` and :attr:`max_value`
+    have been deprecated in favor of :attr:`min_val` and :attr:`max_val`.
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> m = flow.nn.Hardtanh()
+        >>> arr = np.array([0.2, 0.3, 3.0, 4.0])
+        >>> x = flow.Tensor(arr)
+        >>> out = m(x)
+        >>> out
+        tensor([0.2, 0.3, 1. , 1. ], dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        min_val: float = -1,
+        max_val: float = 1,
+        inplace: bool = False,
+        min_value: Optional[float] = None,
+        max_value: Optional[float] = None,
+    ):
+        super().__init__()
+        if min_value is not None:
+            warnings.warn(
+                "keyword argument min_value is deprecated and rename to min_val"
+            )
+            min_val = min_value
+        if max_value is not None:
+            warnings.warn(
+                "keyword argument max_value is deprecated and rename to max_val"
+            )
+            max_val = max_value
+        self.min_val = min_val
+        self.max_val = max_val
+
+    def forward(self, x):
+        return flow.F.hardtanh(x, min_val=self.min_val, max_val=self.max_val)
+
+
+class LeakyReLU(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{LeakyRELU}(x) = \\begin{cases}
+            x, & \\text{ if } x \\geq 0 \\\\
+            \\text{negative_slope} \\times x, & \\text{ otherwise }
+        \\end{cases}
+
+    Args:
+        negative_slope: Controls the angle of the negative slope. Default: 1e-2
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> m = flow.nn.LeakyReLU(0.1)
+        >>> arr = np.array([0.2, 0.3, 3.0, 4.0])
+        >>> x = flow.Tensor(arr)
+        >>> out = m(x)
+        >>> out
+        tensor([0.2, 0.3, 3. , 4. ], dtype=oneflow.float32)
+    """
+
+    def __init__(self, negative_slope: float = 0.01, inplace: bool = False):
+        super().__init__()
+        self.negative_slope = negative_slope
+
+    def forward(self, x):
+        return flow.F.leaky_relu(x, alpha=self.negative_slope)
+
+
+class Mish(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Mish}(x) = x * \\text{Tanh}(\\text{Softplus}(x))
+
+    .. note::
+        See `Mish: A Self Regularized Non-Monotonic Neural Activation Function <https://arxiv.org/abs/1908.08681>`_
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([1, 2, 3]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> mish = flow.nn.Mish()
+
+        >>> out = mish(input)
+        >>> out
+        tensor([0.8651, 1.944 , 2.9865], dtype=oneflow.float32)
+    """
+
+    def __init__(self, inplace: bool = False):
+        assert not inplace, "In-place operation is not currently supported"
+        super().__init__()
+
+    def forward(self, x):
+        return x * flow.experimental.tanh(flow.experimental.softplus(x))
+
+
+def mish_op(x):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Mish}(x) = x * \\text{Tanh}(\\text{Softplus}(x))
+
+    .. note::
+        See `Mish: A Self Regularized Non-Monotonic Neural Activation Function <https://arxiv.org/abs/1908.08681>`_
+
+    See :mod:`oneflow.compatible.single_client.experimental.nn.Mish`
+    """
+    return Mish()(x)
+
+
+@register_tensor_op("mish")
+def mish_op_tensor(x):
+    """
+    mish() -> Tensor
+    See :func:`oneflow.compatible.single_client.experimental.mish`
+    """
+    return Mish()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/adaptive_pool.py b/python/oneflow/compatible/single_client/nn/modules/adaptive_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0a0cc48367dcdc4f7dcf4e0875adbcf75f9b60
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/adaptive_pool.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class AdaptiveAvgPool2d(Module):
+    """Applies a 2D adaptive average pooling over an input signal composed of several input planes.
+
+    The output is of size H x W, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size of the image of the form H x W.
+                     Can be a tuple (H, W) or a single H for a square image H x H.
+                     H and W can be either a ``int``, or ``None`` which means the size will
+                     be the same as that of the input.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import oneflow.compatible.single_client.experimental.nn as nn
+        >>> flow.enable_eager_execution()
+
+        >>> m = nn.AdaptiveAvgPool2d((5,7))
+        >>> input = flow.Tensor(np.random.randn(1, 64, 8, 9))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 5, 7])
+
+        >>> m = nn.AdaptiveAvgPool2d(7)
+        >>> input = flow.Tensor(np.random.randn(1, 64, 10, 9))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 7, 7])
+
+        >>> m = nn.AdaptiveAvgPool2d((None, 7))
+        >>> input = flow.Tensor(np.random.randn(1, 64, 10, 9))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 10, 7])
+
+    """
+
+    def __init__(self, output_size) -> None:
+        super().__init__()
+        self.output_size = output_size
+        self._op = (
+            flow.builtin_op("adaptive_avg_pool2d")
+            .Input("x")
+            .Attr("output_size", [])
+            .Output("y")
+            .Build()
+        )
+
+    def forward(self, x):
+        new_output_size = []
+        assert len(x.shape) == 4
+        if isinstance(self.output_size, int):
+            new_output_size.append(self.output_size)
+            new_output_size.append(self.output_size)
+        elif isinstance(self.output_size, tuple):
+            new_output_size = list(self.output_size)
+            if self.output_size[0] is None:
+                new_output_size[0] = x.shape[2]
+            if self.output_size[1] is None:
+                new_output_size[1] = x.shape[3]
+        else:
+            raise NotImplementedError("output_size param wrong, please check!")
+        new_output_size = tuple(new_output_size)
+        assert (
+            new_output_size[0] <= x.shape[2]
+        ), f"output_size param wrong, please check!"
+        assert (
+            new_output_size[1] <= x.shape[3]
+        ), f"output_size param wrong, please check!"
+        return self._op(x, output_size=new_output_size)[0]
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/arange.py b/python/oneflow/compatible/single_client/nn/modules/arange.py
new file mode 100644
index 0000000000000000000000000000000000000000..68c8e248a32aaf1de60f15a54391ca8381b3fc1e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/arange.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Arange(Module):
+    def __init__(
+        self,
+        start: int = 0,
+        end: int = None,
+        step: int = 1,
+        dtype: flow.dtype = None,
+        device: Union[str, flow.device] = "cpu",
+        requires_grad: bool = False,
+    ) -> None:
+        super().__init__()
+        assert end > start, "end should be larger than start"
+        assert step <= end - start, "step is ilegal"
+        self.start = start
+        self.end = end
+        self.step = step
+        self.dtype = dtype
+        self.device = device
+        self.requires_grad = requires_grad
+
+    def forward(self):
+        tmp = flow.F.range(
+            start=self.start, limit=self.end, delta=self.step, dtype=flow.int64
+        )
+        tmp.requires_grad = self.requires_grad
+        if isinstance(self.device, str):
+            device = flow.device(self.device)
+        else:
+            device = self.device
+        res = tmp.to(device, dtype=self.dtype)
+        return res
+
+
+def arange_op(
+    start: int = 0,
+    end: int = None,
+    step: int = 1,
+    dtype: flow.dtype = flow.int64,
+    device: Union[str, flow.device] = "cpu",
+    requires_grad: bool = False,
+):
+    """
+    Returns a 1-D tensor of size :math:`\\left\\lfloor \\frac{\\text{end} - \\text{start}}{\\text{step}} \\right\\rfloor + 1`
+    with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is
+    the gap between two values in the tensor.
+
+    .. math::
+        \\text{out}_{i+1} = \\text{out}_i + \\text{step}.
+
+    Args:
+        start (int): the starting value for the set of points. Default: ``0``.
+        end (int): the ending value for the set of points
+        step (int): the gap between each pair of adjacent points. Default: ``1``.
+
+    Keyword args:
+        dtype(flow.dtype, optional): If `dtype` is not given, the `dtype` is inferred to be `flow.int64`.
+        device(flow.device, optional): the desired device of returned tensor. Default: if None, uses the current device for the default tensor.
+        requires_grad(bool, optional): If autograd should record operations on the returned tensor. Default: `False`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> y = flow.arange(0, 5)
+        >>> y
+        tensor([0, 1, 2, 3, 4], dtype=oneflow.int64)
+
+    """
+    if end is None:
+        end = start
+        start = 0
+    return Arange(start, end, step, dtype, device, requires_grad)()
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/argmax.py b/python/oneflow/compatible/single_client/nn/modules/argmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..df5c6cb57894553519474b18b816f57150d58167
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/argmax.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class Argmax(Module):
+    def __init__(self, dim: int = None, keepdim: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, input):
+        if self.dim == None:
+            input = flow.F.flatten(input)
+            self.dim = 0
+        num_axes = len(input.shape)
+        axis = self.dim if self.dim >= 0 else self.dim + num_axes
+        assert 0 <= axis < num_axes, "axis out of range"
+        if axis == num_axes - 1:
+            x = flow.F.argmax(input)
+            if self.keepdim == True:
+                x = flow.experimental.unsqueeze(x, -1)
+            return x
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+            x = flow.F.transpose(input, perm=perm)
+            x = flow.F.argmax(x)
+            x = flow.experimental.unsqueeze(x, -1)
+            x = flow.F.transpose(x, perm=get_inversed_perm(perm))
+            if self.keepdim == False:
+                x = x.squeeze(dim=[axis])
+            return x
+
+
+@register_tensor_op("argmax")
+def argmax_op(input, dim: int = None, keepdim: bool = False):
+    """The op computes the index with the largest value of a Tensor at specified axis.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): Input Tensor
+        dim (int, optional): dimension to be calculated. Defaults to the last dim (-1)
+        keepdim (bool optional):  whether the output tensor has dim retained or not. Ignored if dim=None.
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor(dtype=int32) contains the index with the largest value of `input`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([[1, 3, 8, 7, 2],
+        ...            [1, 9, 4, 3, 2]], dtype=np.float32)
+
+        >>> out = flow.argmax(flow.Tensor(x))
+        >>> out
+        tensor([6], dtype=oneflow.int32)
+        >>> out = flow.argmax(flow.Tensor(x), dim=1)
+        >>> out
+        tensor([2, 1], dtype=oneflow.int32)
+
+    """
+    return Argmax(dim=dim, keepdim=keepdim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/argsort.py b/python/oneflow/compatible/single_client/nn/modules/argsort.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d3180299ea7071f8d7c2836872d98a8b170396
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/argsort.py
@@ -0,0 +1,94 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class Argsort(Module):
+    def __init__(self, dim: int = -1, descending: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        direction = "DESCENDING" if descending else "ASCENDING"
+        self._argsort_op = (
+            flow.builtin_op("arg_sort")
+            .Input("in")
+            .Output("out")
+            .Attr("direction", direction)
+            .Build()
+        )
+
+    def forward(self, input):
+        num_dims = len(input.shape)
+        dim = self.dim if self.dim >= 0 else self.dim + num_dims
+        assert 0 <= dim < num_dims, "dim out of range"
+        if dim == num_dims - 1:
+            return self._argsort_op(input)[0]
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_dims, dim)
+            x = flow.F.transpose(input, perm=perm)
+            x = self._argsort_op(x)[0]
+            return flow.F.transpose(x, perm=get_inversed_perm(perm))
+
+
+@register_tensor_op("argsort")
+def argsort_op(input, dim: int = -1, descending: bool = False):
+    """This operator sorts the input Tensor at specified dim and return the indices of the sorted Tensor.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): The input Tensor.
+        dim (int, optional): dimension to be sorted. Defaults to the last dim (-1).
+        descending (bool, optional): controls the sorting order (ascending or descending).
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The indices of the sorted Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([[10, 2, 9, 3, 7],
+        ...               [1, 9, 4, 3, 2]]).astype("float32")
+        >>> input = flow.Tensor(x)
+        >>> output = flow.argsort(input)
+        >>> output
+        tensor([[1, 3, 4, 2, 0],
+                [0, 4, 3, 2, 1]], dtype=oneflow.int32)
+        >>> output = flow.argsort(input, descending=True)
+        >>> output
+        tensor([[0, 2, 4, 3, 1],
+                [1, 2, 3, 4, 0]], dtype=oneflow.int32)
+        >>> output = flow.argsort(input, dim=0)
+        >>> output
+        tensor([[1, 0, 1, 0, 1],
+                [0, 1, 0, 1, 0]], dtype=oneflow.int32)
+
+    """
+    return Argsort(dim=dim, descending=descending)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/argwhere.py b/python/oneflow/compatible/single_client/nn/modules/argwhere.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0391974a122395935399b823ef27b13ea561dfe
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/argwhere.py
@@ -0,0 +1,87 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import numpy as np
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Argwhere(Module):
+    def __init__(self, dtype) -> None:
+        super().__init__()
+        if dtype == None:
+            dtype = flow.int32
+        self.dtype = dtype
+
+    def forward(self, x):
+        (res, size) = flow.F.argwhere(x, dtype=self.dtype)
+        slice_tup_list = [[0, int(size.numpy()), 1]]
+        return flow.experimental.slice(res, slice_tup_list=slice_tup_list)
+
+
+def argwhere_op(x, dtype: Optional[flow.dtype] = None):
+    """This operator finds the indices of input Tensor `x` elements that are non-zero. 
+
+    It returns a list in which each element is a coordinate that points to a non-zero element in the condition.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): The input Tensor.
+        dtype (Optional[flow.dtype], optional): The data type of output. Defaults to None.
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([[0, 1, 0],
+        ...            [2, 0, 2]]).astype(np.float32)
+        
+        >>> input = flow.Tensor(x)
+        >>> output = flow.argwhere(input)
+        >>> output
+        tensor([[0, 1],
+                [1, 0],
+                [1, 2]], dtype=oneflow.int32)
+
+    """
+    return Argwhere(dtype=dtype)(x)
+
+
+@register_tensor_op("argwhere")
+def argwhere_tebsor_op(x, dtype: Optional[flow.dtype] = None):
+    """
+
+    argwhere() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.argwhere`
+
+    """
+    return Argwhere(dtype=dtype)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/atan2.py b/python/oneflow/compatible/single_client/nn/modules/atan2.py
new file mode 100644
index 0000000000000000000000000000000000000000..04984c05cf4d0e3467d04d77b52c5a92950e4d5f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/atan2.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Atan2(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.atan2_op = (
+            flow.builtin_op("atan2").Input("x").Input("y").Output("z").Build()
+        )
+
+    def forward(self, x, y):
+        return self.atan2_op(x, y)[0]
+
+
+def atan2_op(input, other):
+    """Element-wise arctangent of input{i}/other{i}
+    with consideration of the quadrant. Returns a new tensor with the signed
+    angles in radians between vector (other{i},input{i}) and vector (1, 0).
+
+    The shapes of input and other must be broadcastable.
+
+    Args:
+        input (Tensor): the first input tensor.
+
+        other (Tensor): the second input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+
+        >>> x1 = flow.Tensor(np.array([1,2,3]))
+        >>> y1 = flow.Tensor(np.array([3,2,1]))
+        >>> x2 = flow.Tensor(np.array([1.53123589,0.54242598,0.15117185]))
+        >>> y2 = flow.Tensor(np.array([-0.21906378,0.09467151,-0.75562878]))
+        >>> x3 = flow.Tensor(np.array([1,0,-1]))
+        >>> y3 = flow.Tensor(np.array([0,1,0]))
+
+        >>> flow.enable_eager_execution()
+        >>> flow.atan2(x1,y1).numpy()
+        array([0.32175055, 0.7853982 , 1.2490457 ], dtype=float32)
+        >>> flow.atan2(x2,y2).numpy()
+        array([1.7128955, 1.3980033, 2.9441385], dtype=float32)
+        >>> flow.atan2(x3,y3).numpy()
+        array([ 1.5707964,  0.       , -1.5707964], dtype=float32)
+
+    """
+    return Atan2()(input, other)
+
+
+@register_tensor_op("atan2")
+def atan2_op_tensor(input, other):
+    """
+
+    atan2(other) -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.atan2`
+    """
+    return Atan2()(input, other)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/atanh.py b/python/oneflow/compatible/single_client/nn/modules/atanh.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4f9790e05c7f3786a41b10415afe7b96fa270a7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/atanh.py
@@ -0,0 +1,85 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Atanh(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.atanh(x)
+
+
+def atanh_op(input):
+    """Returns a new tensor with the inverse hyperbolic tangent of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\tanh^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> np_arr = np.array([0.5, 0.6, 0.7]).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> output = flow.atanh(input)
+        >>> output
+        tensor([0.5493, 0.6931, 0.8673], dtype=oneflow.float32)
+
+    """
+    return Atanh()(input)
+
+
+@register_tensor_op("atanh")
+def atanh_op_tensor(x):
+    """
+    atanh() -> Tensor
+    See :func:`oneflow.compatible.single_client.experimental.atanh`
+
+    """
+    return Atanh()(x)
+
+
+def arctanh_op(input):
+    """
+
+    Alias for :func:`oneflow.compatible.single_client.experimental.atanh`
+    """
+    return Atanh()(input)
+
+
+@register_tensor_op("arctanh")
+def arctanh_op_tensor(input):
+    """
+
+    Alias for :func:`oneflow.compatible.single_client.experimental.atanh`
+    """
+    return Atanh()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/batchnorm.py b/python/oneflow/compatible/single_client/nn/modules/batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..805cdf7e12a9f621cd42451ac1890fb6494c272c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/batchnorm.py
@@ -0,0 +1,337 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class _NormBase(Module):
+    """Common base of _InstanceNorm and _BatchNorm"""
+
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-05,
+        momentum: float = 0.1,
+        affine: bool = True,
+        track_running_stats: bool = True,
+        device: Union[str, flow.device] = None,
+        dtype: flow.dtype = None,
+    ) -> None:
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.momentum = momentum
+        self.affine = affine
+        self.track_running_stats = track_running_stats
+        self.device = device
+        self.dtype = dtype
+        if self.affine:
+            self.weight = flow.nn.Parameter(
+                flow.Tensor(num_features, device=self.device)
+            )
+            self.bias = flow.nn.Parameter(flow.Tensor(num_features, device=self.device))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        if self.track_running_stats:
+            self.register_buffer(
+                "running_mean", flow.Tensor(num_features, device=self.device)
+            )
+            self.register_buffer(
+                "running_var", flow.Tensor(num_features, device=self.device)
+            )
+        else:
+            self.register_parameter("running_mean", None)
+            self.register_parameter("running_var", None)
+        self.reset_parameters()
+
+    def reset_running_stats(self) -> None:
+        if self.track_running_stats:
+            self.running_mean.fill_(0)
+            self.running_var.fill_(1)
+
+    def reset_parameters(self) -> None:
+        self.reset_running_stats()
+        if self.affine:
+            flow.nn.init.ones_(self.weight)
+            flow.nn.init.zeros_(self.bias)
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        super(_NormBase, self)._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+
+class _BatchNorm(_NormBase):
+    def __init__(
+        self,
+        num_features,
+        eps=1e-05,
+        momentum=0.1,
+        affine=True,
+        track_running_stats=True,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            num_features, eps, momentum, affine, track_running_stats, device, dtype
+        )
+
+    def forward(self, x):
+        if self.dtype is None:
+            self.dtype = x.dtype
+        if self.device is None:
+            self.device = x.device
+        self._check_input_dim(x)
+        reduce_axis = []
+        for dim in range(len(x.shape)):
+            if dim != 1:
+                reduce_axis.append(dim)
+        mean = x.mean(dim=reduce_axis, keepdim=False)
+        variance = x.var(dim=reduce_axis, keepdim=False)
+        if x.device == flow.device("cpu"):
+            if self.training and self.track_running_stats:
+                running_mean = (
+                    self.momentum * self.running_mean + (1 - self.momentum) * mean
+                )
+                running_var = (
+                    self.momentum * self.running_var + (1 - self.momentum) * variance
+                )
+                self.__setattr__("running_mean", flow.Tensor(running_mean))
+                self.__setattr__("running_var", flow.Tensor(running_var))
+            else:
+                mean = mean if self.running_mean is None else self.running_mean
+                variance = variance if self.running_var is None else self.running_var
+            axis = 1
+            params_shape = [x.shape[axis]]
+            weight = self.weight
+            bias = self.bias
+            if len(mean.shape) == 1:
+                nd_params_shape = [1] * len(x.shape)
+                nd_params_shape[axis] = params_shape[0]
+                mean = mean.reshape(shape=nd_params_shape)
+                variance = variance.reshape(shape=nd_params_shape)
+                if self.weight and params_shape[0] == self.weight.nelement():
+                    weight = self.weight.reshape(shape=nd_params_shape)
+                if self.bias and params_shape[0] == self.bias.nelement():
+                    bias = self.bias.reshape(shape=nd_params_shape)
+            elif len(mean.shape) == len(x.shape):
+                pass
+            else:
+                raise ValueError(
+                    "shape of mean and variance should be 1D or has number of axes and x's"
+                )
+            variance += self.eps
+            normalized = (x - mean) * variance.rsqrt()
+            affined = normalized
+            if self.weight:
+                affined = affined * weight
+            if self.bias:
+                affined = affined + bias
+            return affined.to(dtype=self.dtype)
+        else:
+            res = flow.F.normalization(
+                x,
+                self.running_mean if self.track_running_stats else mean,
+                self.running_var if self.track_running_stats else variance,
+                self.weight,
+                self.bias,
+                axis=1,
+                epsilon=self.eps,
+                momentum=self.momentum,
+                is_training=self.training,
+            )
+            return res.to(dtype=self.dtype, device=self.device)
+
+
+class BatchNorm1d(_BatchNorm):
+    """Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D
+    inputs with optional additional channel dimension) as described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{\\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension over
+    the mini-batches and :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size). By default, the elements of :math:`\\gamma` are set
+    to 1 and the elements of :math:`\\beta` are set to 0. The standard-deviation is calculated
+    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done over the `C` dimension, computing statistics
+    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics.
+            in both training and eval modes. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C)` or :math:`(N, C, L)`
+        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.random.randn(20, 100))
+        >>> m = flow.nn.BatchNorm1d(100)
+        >>> y = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.ndim != 2 and input.ndim != 3:
+            raise ValueError(
+                "expected 2D or 3D input (got {}D input)".format(input.ndim)
+            )
+
+
+class BatchNorm2d(_BatchNorm):
+    """Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs
+    with additional channel dimension) as described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension over
+    the mini-batches and :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size). By default, the elements of :math:`\\gamma` are set
+    to 1 and the elements of :math:`\\beta` are set to 0. The standard-deviation is calculated
+    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done over the `C` dimension, computing statistics
+    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, H, W)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics.
+            in both training and eval modes. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C, H, W)`
+        - Output: :math:`(N, C, H, W)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.random.randn(4, 2, 8, 3))
+        >>> m = flow.nn.BatchNorm2d(num_features=2, eps=1e-5, momentum=0.1)
+        >>> y = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.ndim != 4:
+            raise ValueError("expected 4D input (got {}D input)".format(input.ndim()))
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/bmm.py b/python/oneflow/compatible/single_client/nn/modules/bmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b470cb6a428e09409cbe10350360a878cc78c1c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/bmm.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class BMM(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, mat2):
+        assert (
+            input.shape[0] == mat2.shape[0] and input.shape[2] == mat2.shape[1]
+        ), f"batch dim or matmul dim not match, please check input!"
+        return flow.F.batch_matmul(input, mat2)
+
+
+def bmm_op(x, y):
+    """
+    Performs a batch matrix-matrix product of matrices stored in input and mat2.
+
+    `input` and `mat2` must be 3-D tensors each containing the same number of matrices.
+
+    If input is a (b x n x m) tensor, mat2 is a (b x m x p) tensor, out will be a (b x n x p) tensor.
+
+    Args:
+        input(oneflow.compatible.single_client.Tensor):  the first batch of matrices to be multiplied
+        mat2(oneflow.compatible.single_client.Tensor): the second batch of matrices to be multiplied
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> input1 = flow.Tensor(np.random.randn(10, 3, 4), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(10, 4, 5), dtype=flow.float32)
+        >>> of_out = flow.bmm(input1, input2)
+        >>> of_out.shape
+        flow.Size([10, 3, 5])
+    """
+    return BMM()(x, y)
+
+
+@register_tensor_op("bmm")
+def bmm_op_tensor(x, y):
+    """
+
+    bmm() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.bmm`
+
+    """
+    return BMM()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/broadcast_like.py b/python/oneflow/compatible/single_client/nn/modules/broadcast_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd48db445bd6a9d5f227f564eb92186e5e1b492
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/broadcast_like.py
@@ -0,0 +1,30 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class BroadCastLike(Module):
+    def __init__(self, broadcast_axes: None) -> None:
+        super().__init__()
+        self.broadcast_axes = broadcast_axes
+
+    def forward(self, x, like_tensor):
+        return flow.F.broadcast_like(x, like_tensor, broadcast_axes=self.broadcast_axes)
+
+
+def broadcast_like_op(x, like_tensor, broadcast_axes: None):
+    return BroadCastLike(broadcast_axes=broadcast_axes)(x, like_tensor)
diff --git a/python/oneflow/compatible/single_client/nn/modules/cast.py b/python/oneflow/compatible/single_client/nn/modules/cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..f310655d6d80f50a230f13c2ac3390e4b7d31885
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/cast.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Cast(Module):
+    def __init__(self, dtype: flow.dtype) -> None:
+        super().__init__()
+        self.dtype = dtype
+
+    def forward(self, x):
+        return flow.F.cast(x, dtype=self.dtype)
+
+
+@register_tensor_op("cast")
+def cast_op(x, dtype):
+    """The operation takes input tensor `x` and casts it to the output with `dtype`
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+        dtype (flow.dtype): Data type of the output tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor with specific dtype.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> np_arr = np.random.randn(2, 3, 4, 5).astype(np.float32)
+        >>> input = flow.Tensor(np_arr, dtype=flow.float32)
+        >>> output = flow.cast(input, flow.int8)
+        >>> np.array_equal(output.numpy(), np_arr.astype(np.int8))
+        True
+
+    """
+    return Cast(dtype)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/chunk.py b/python/oneflow/compatible/single_client/nn/modules/chunk.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f71229c268507e4890c70db637ba45d1b7e5706
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/chunk.py
@@ -0,0 +1,128 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor, register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.ops.array_ops import check_slice_tup_list
+
+
+class Chunk(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, chunks, dim):
+        if dim is not None:
+            assert input.shape[dim] > 0, "chunk expects at least a 1-dimensional tensor"
+            assert chunks > 0, "chunk expects `chunks` to be greater than 0"
+            channel = input.dim()
+            dim_size = input.shape[dim]
+            chunk_size = (
+                dim_size / chunks if dim_size % chunks == 0 else int(dim_size / chunks)
+            )
+            last_chunk_size = (
+                dim_size / chunks
+                if dim_size % chunks == 0
+                else dim_size - chunk_size * (chunks - 1)
+            )
+            chunk_dim_dict = {}
+            tup_ndim = []
+            splits = []
+            for chunk in range(0, chunks):
+                if dim_size % chunks == 0:
+                    start = chunk * chunk_size
+                    stop = (chunk + 1) * chunk_size
+                else:
+                    start = (
+                        chunk * chunk_size
+                        if chunk < chunks - 1
+                        else chunk_size * (chunks - 1)
+                    )
+                    stop = (chunk + 1) * chunk_size if chunk < chunks - 1 else dim_size
+                step = 1
+                chunk_dim_dict.setdefault(dim, []).append(
+                    [int(start), int(stop), int(step)]
+                )
+            for (k, v) in chunk_dim_dict.items():
+                for v_chunk in v:
+                    tup_list = []
+                    for i in range(0, channel):
+                        if i != dim:
+                            tup_list.append([None, None, None])
+                        else:
+                            tup_list.append(v_chunk)
+                    (start_tup, stop_tup, step_tup) = check_slice_tup_list(
+                        tup_list, input.shape
+                    )
+                    splits.append(
+                        flow.F.slice(
+                            input, start=start_tup, stop=stop_tup, step=step_tup
+                        )
+                    )
+            return splits
+
+
+@register_tensor_op("chunk")
+def chunk_op(input, chunks, dim):
+    """Splits a tensor into a specific number of chunks. Each chunk is a view of the input tensor. Last chunk will be smaller if the tensor size along the given dimension dim is not divisible by chunks.
+
+    Args:
+        input (oneflow.compatible.single_client.experimental.Tensor): The tensor to split.
+        chunks (int): Number of chunks to return.
+        dim (int): Dimension along which to split the tensor.
+
+    Returns:
+        List of Tensors.
+
+    For example:
+
+    .. code-block:: python
+    
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+       
+        >>> np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> of_out = []
+        >>> of_out = flow.chunk(input, chunks=3, dim=2)
+        >>> chunks = 3
+        >>> of_out_shape = []
+        >>> for i in range(0, chunks):
+        ...     of_out_shape.append(of_out[i].numpy().shape)
+        >>> of_out_shape
+        [(5, 3, 2, 9), (5, 3, 2, 9), (5, 3, 2, 9)]
+
+        >>> np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> of_out = []
+        >>> of_out = flow.chunk(input, chunks=4, dim=3)
+        >>> chunks = 4
+        >>> of_out_shape = []
+        >>> for i in range(0, chunks):
+        ...     of_out_shape.append(of_out[i].numpy().shape)
+        >>> of_out_shape
+        [(5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 3)]
+
+    """
+    return Chunk()(input, chunks, dim)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/concat.py b/python/oneflow/compatible/single_client/nn/modules/concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4d18b562da28c1c01d73d77b23f985da79ffc25
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/concat.py
@@ -0,0 +1,85 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor, register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Cat(Module):
+    def __init__(self, dim=0) -> None:
+        super().__init__()
+        self.axis = dim
+
+    def forward(self, inputs):
+        if len(inputs) == 1:
+            return inputs[0]
+        axis = self.axis
+        assert len(inputs) >= 2
+        if axis < 0:
+            axis += len(inputs[0].shape)
+        assert axis >= 0 and axis < len(
+            inputs[0].shape
+        ), "axis must be in range [0, num_axes of inputs)"
+        first_input_shape = inputs[0].shape
+        dynamic_dim_size = 0
+        for input in inputs:
+            assert len(input.shape) == len(first_input_shape)
+            for i in range(len(input.shape)):
+                if i == axis:
+                    dynamic_dim_size += input.shape[i]
+                else:
+                    assert input.shape[i] == first_input_shape[i]
+        return flow.F.concat(inputs, axis=axis, max_dim_size=dynamic_dim_size)
+
+
+def concat_op(inputs, dim=0):
+    """Concatenate two or more `Tensor` s at specified axis.
+
+    Analogous to `numpy.concatenate <https://docs.scipy.org/doc/numpy/reference/generated/numpy.concatenate.html>`_
+
+    Args:
+        inputs: a `list` of `Tensor`
+        dim: a `int`.
+
+    Returns:
+        A `Tensor`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> import numpy as np
+
+        >>> input1 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> input3 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+
+        >>> out = flow.cat([input1, input2, input3], dim=1)
+        >>> out.shape
+        flow.Size([2, 18, 5, 3])
+
+    """
+    return Cat(dim=dim)(inputs)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/constant.py b/python/oneflow/compatible/single_client/nn/modules/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..4babe05dcc50b3c5eee509672012108eb5f1e90c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/constant.py
@@ -0,0 +1,272 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.common_types import _size_any_t
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import _single
+
+
+class _ConstantBase(Module):
+    def __init__(
+        self,
+        size: Union[_size_any_t, flow.Size],
+        value: Union[float, int],
+        dtype: Optional[flow.dtype],
+        device: Union[flow.device, str] = None,
+        requires_grad: bool = False,
+    ) -> None:
+        super().__init__()
+        assert size is not None, "shape must not be None!"
+        assert isinstance(
+            size, (int, tuple, flow.Size)
+        ), "shape should be int or tuple int!"
+        self.device = device
+        self.requires_grad = requires_grad
+        size = _single(size)
+        if dtype is None:
+            dtype = flow.float32
+        if device is None:
+            self.device = flow.device("cpu")
+        self.shape = size
+        self.value = value
+        self.dtype = dtype
+
+    def forward(self):
+        res = flow.F.constant(self.shape, self.value, self.dtype)
+        res = res.to(device=self.device)
+        res.requires_grad = self.requires_grad
+        return res
+
+
+class Ones(_ConstantBase):
+    def __init__(self, size, dtype=None, device=None, requires_grad=False):
+        super().__init__(size, 1, dtype, device, requires_grad)
+
+
+def ones_op(
+    size: Union[_size_any_t, flow.Size],
+    dtype: Optional[flow.dtype] = None,
+    device: Union[flow.device, str, None] = None,
+    requires_grad: bool = False,
+):
+    """
+    Returns a tensor filled with the scalar value 1,
+    with the shape defined by the variable argument `size`.
+
+    Args:
+        size (an integer or tuple of integer values) 鈥� defining the shape of the output tensor. Can be \\
+         a variable number of arguments or a collection like a list or tuple.
+        dtype (flow.dtype, optional) 鈥� the desired data type of returned tensor.
+        device (torch.device, optional) 鈥� the desired device of returned tensor. Default: if None, uses the current device for the default tensor type
+        requires_grad (bool, optional) 鈥� If autograd should record operations on the returned tensor. Default: False.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> y = flow.ones(5)
+        >>> y
+        tensor([1., 1., 1., 1., 1.], dtype=oneflow.float32)
+
+    """
+    return Ones(size, dtype, device, requires_grad)()
+
+
+class Zeros(_ConstantBase):
+    def __init__(self, size, dtype=None, device=None, requires_grad=False):
+        super().__init__(size, 0, dtype, device, requires_grad)
+
+
+def zeros_op(
+    size: Union[_size_any_t, flow.Size],
+    dtype: Optional[flow.dtype] = None,
+    device: Union[flow.device, str, None] = None,
+    requires_grad: bool = False,
+):
+    """
+    Returns a tensor filled with the scalar value 0,
+    with the shape defined by the variable argument `size`.
+
+    Args:
+        size(an integer or tuple of integer values) - defining the shape of the output tensor. Can be \\
+         a variable number of arguments or a collection like a list or tuple.
+        dtype (flow.dtype, optional) 鈥� the desired data type of returned tensor.
+        device (torch.device, optional) 鈥� the desired device of returned tensor. Default: if None, uses the current device for the default tensor type
+        requires_grad (bool, optional) 鈥� If autograd should record operations on the returned tensor. Default: False.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> y = flow.zeros(5)
+        >>> y
+        tensor([0., 0., 0., 0., 0.], dtype=oneflow.float32)
+
+    """
+    return Zeros(size, dtype, device, requires_grad)()
+
+
+class ZerosLike(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, other):
+        return flow.F.zeros_like(other)
+
+
+def zeros_like_op(other):
+    """
+    Returns a tensor filled with the scalar value 0, with the same size as input.
+    flow.zeros_like(input) is equivalent to flow.zeros(input.shape, dtype=input.dtype)
+
+    Args:
+        other(Tensor): The size of input will determine size of the output tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client.experimental as flow
+        import numpy as np
+
+        x = flow.Tensor(np.random.rand([5]))
+        y = flow.zeros_like(x)
+        # [0. 0. 0. 0. 0. ]
+
+    """
+    return ZerosLike()(other)
+
+
+class OnesLike(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, other):
+        return flow.F.ones_like(other)
+
+
+def ones_like_op(other):
+    """
+    Returns a tensor filled with the scalar value 1, with the same size as input.
+    flow.ones_like(input) is equivalent to flow.ones(input.shape, dtype=input.dtype)
+
+    Args:
+        other(Tensor): The size of input will determine size of the output tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client.experimental as flow
+        import numpy as np
+
+        x = flow.Tensor(np.random.rand([5]))
+        y = flow.ones_like(x)
+        # [1. 1. 1. 1. 1. ]
+
+    """
+    return OnesLike()(other)
+
+
+class NewOnes(Module):
+    def __init__(
+        self,
+        size: Union[_size_any_t, flow.Size] = None,
+        dtype: Optional[flow.dtype] = None,
+        device: Union[flow.device, str] = None,
+        requires_grad: bool = False,
+    ):
+        super().__init__()
+        self.device = device
+        self.requires_grad = requires_grad
+        if size != None:
+            size = _single(size)
+        self.size = size
+        self.dtype = dtype
+
+    def forward(self, x):
+        new_size = self.size
+        new_dtype = self.dtype
+        new_device = self.device
+        new_requires_grad = self.requires_grad
+        if self.size is None:
+            new_size = x.shape
+        if self.dtype is None:
+            new_dtype = x.dtype
+        if self.device is None:
+            new_device = x.device
+        assert isinstance(
+            new_size, (int, tuple, flow.Size)
+        ), f"size parameter not correct, please check!"
+        assert isinstance(
+            new_dtype, flow.dtype
+        ), f"dtype parameter not correct, please check!"
+        assert isinstance(
+            new_device, (str, flow.device)
+        ), f"device parameter not correct, please check!"
+        assert isinstance(
+            new_requires_grad, bool
+        ), f"requires_grad parameter not correct, please check!"
+        res = flow.F.constant(new_size, 1.0, new_dtype)
+        res = res.to(new_device)
+        res.requires_grad = new_requires_grad
+        return res
+
+
+@register_tensor_op("new_ones")
+def new_ones_op(x, size=None, dtype=None, device=None, requires_grad=False):
+    """
+    
+    Returns a Tensor of size size filled with 1. By default, the returned Tensor has the same torch.dtype and torch.device as this tensor.
+
+    Args:
+        size (int...): a list, tuple, or flow.Size of integers defining the shape of the output tensor.
+        dtype (flow.dtype, optional):  the desired type of returned tensor. Default: if None, same flow.dtype as this tensor.
+        device (flow.device, optional): the desired device of returned tensor. Default: if None, same flow.device as this tensor.
+        requires_grad (bool, optional): If autograd should record operations on the returned tensor. Default: False.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.ones((1, 2, 3)))
+        >>> y = x.new_ones((2, 2))
+        >>> y
+        tensor([[1., 1.],
+                [1., 1.]], dtype=oneflow.float32)
+    """
+    return NewOnes(size=size, dtype=dtype, device=device, requires_grad=requires_grad)(
+        x
+    )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/constantpad2d.py b/python/oneflow/compatible/single_client/nn/modules/constantpad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..71af8461fa5074a4d96a770fbe587b1ae50a2b33
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/constantpad2d.py
@@ -0,0 +1,118 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class ConstantPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ConstantPad2d.html?highlight=constantpad2d#torch.nn.ConstantPad2d
+
+    This operator pads the input with constant value that user specifies. User can set the amount of padding by setting the parameter `paddings`.
+
+    Args:
+        padding (Union[int, tuple, list]):  the size of the padding. If is `int`, uses the same padding in all boundaries. If a 4-`tuple`, uses (:math:`\\mathrm{padding_{left}}`, :math:`\\mathrm{padding_{right}}`, :math:`\\mathrm{padding_{top}}`, :math:`\\mathrm{padding_{bottom}}`)
+        
+        value (Union[int, float]): The constant value used for padding. Defaults to 0.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+            :math:`H_{out} = H_{in} + \\mathrm{padding_{top}} + \\mathrm{padding_{bottom}}`
+
+            :math:`W_{out} = W_{in} + \\mathrm{padding_{left}} + \\mathrm{padding_{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> constantpad_layer_0 = flow.nn.ConstantPad2d((2, 2, 1, 1), 1)
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32))
+        >>> input_int = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.int32))
+        >>> output = constantpad_layer_0(input)
+        >>> output.shape
+        flow.Size([1, 2, 5, 7])
+        >>> output
+        tensor([[[[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  0.,  1.,  2.,  1.,  1.],
+                  [ 1.,  1.,  3.,  4.,  5.,  1.,  1.],
+                  [ 1.,  1.,  6.,  7.,  8.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]],
+        <BLANKLINE>
+                 [[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  9., 10., 11.,  1.,  1.],
+                  [ 1.,  1., 12., 13., 14.,  1.,  1.],
+                  [ 1.,  1., 15., 16., 17.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]]]], dtype=oneflow.float32)
+        >>> output_int = constantpad_layer_0(input_int)
+        >>> output_int
+        tensor([[[[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  0.,  1.,  2.,  1.,  1.],
+                  [ 1.,  1.,  3.,  4.,  5.,  1.,  1.],
+                  [ 1.,  1.,  6.,  7.,  8.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]],
+        <BLANKLINE>
+                 [[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  9., 10., 11.,  1.,  1.],
+                  [ 1.,  1., 12., 13., 14.,  1.,  1.],
+                  [ 1.,  1., 15., 16., 17.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]]]], dtype=oneflow.float32)
+    """
+
+    def __init__(self, padding: Union[int, tuple, list], value: Union[int, float] = 0):
+        super().__init__()
+        if isinstance(padding, (tuple, list)):
+            assert len(padding) == 4, ValueError("Length of padding must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be int or list or tuple!")
+        self.padding = boundary
+        self.value = value
+
+    def forward(self, x):
+        (_, _, h, w) = x.shape
+        if x.dtype in [flow.float32, flow.float16, flow.float64]:
+            floating_value = float(self.value)
+            integral_value = int(0)
+        else:
+            floating_value = float(0)
+            integral_value = int(self.value)
+        self._op = (
+            flow.builtin_op("constant_pad2d")
+            .Input("x")
+            .Output("y")
+            .Attr("padding", self.padding)
+            .Attr("floating_value", floating_value)
+            .Attr("integral_value", integral_value)
+            .Build()
+        )
+        res = self._op(x)[0]
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/container.py b/python/oneflow/compatible/single_client/nn/modules/container.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecc5b0ac59a77a0a9ba5f647aa866665f2693b34
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/container.py
@@ -0,0 +1,524 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections.abc
+import operator
+from collections import OrderedDict
+from itertools import islice
+from typing import (
+    Any,
+    Iterable,
+    Iterator,
+    Mapping,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+    overload,
+)
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+T = TypeVar("T")
+
+
+class Sequential(Module):
+    """A sequential container.
+    Modules will be added to it in the order they are passed in the constructor.
+    Alternatively, an ordered dict of modules can also be passed in.
+
+    To make it easier to understand, here is a small example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental.nn as nn
+        >>> nn.Sequential(nn.Conv2d(1,20,5), nn.ReLU(), nn.Conv2d(20,64,5), nn.ReLU()) #doctest: +ELLIPSIS
+        <oneflow.compatible.single_client.python.nn.modules.container.Sequential object at 0x...>
+        >>> nn.Sequential(OrderedDict([
+        ...    ('conv1', nn.Conv2d(1,20,5)),
+        ...    ('relu1', nn.ReLU()),
+        ...    ('conv2', nn.Conv2d(20,64,5)),
+        ...    ('relu2', nn.ReLU())
+        ... ])) #doctest: +ELLIPSIS
+        <oneflow.compatible.single_client.python.nn.modules.container.Sequential object at 0x...>
+
+    """
+
+    @overload
+    def __init__(self, *args: Module) -> None:
+        ...
+
+    @overload
+    def __init__(self, arg: "OrderedDict[str, Module]") -> None:
+        ...
+
+    def __init__(self, *args: Any):
+        super(Sequential, self).__init__()
+        if len(args) == 1 and isinstance(args[0], OrderedDict):
+            for (key, module) in args[0].items():
+                self.add_module(key, module)
+        else:
+            for (idx, module) in enumerate(args):
+                self.add_module(str(idx), module)
+
+    def _get_item_by_idx(self, iterator, idx):
+        """Get the idx-th item of the iterator"""
+        size = len(self)
+        idx = operator.index(idx)
+        if not -size <= idx < size:
+            raise IndexError("index {} is out of range".format(idx))
+        idx %= size
+        return next(islice(iterator, idx, None))
+
+    def __getitem__(self: T, idx) -> T:
+        if isinstance(idx, slice):
+            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
+        else:
+            return self._get_item_by_idx(self._modules.values(), idx)
+
+    def __setitem__(self, idx: int, module: Module) -> None:
+        key = self._get_item_by_idx(self._modules.keys(), idx)
+        return setattr(self, key, module)
+
+    def __delitem__(self, idx: Union[slice, int]) -> None:
+        if isinstance(idx, slice):
+            for key in list(self._modules.keys())[idx]:
+                delattr(self, key)
+        else:
+            key = self._get_item_by_idx(self._modules.keys(), idx)
+            delattr(self, key)
+
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    def __dir__(self):
+        keys = super(Sequential, self).__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    def __iter__(self) -> Iterator[Module]:
+        return iter(self._modules.values())
+
+    def forward(self, input):
+        for module in self:
+            input = module(input)
+        return input
+
+
+class ParameterList(Module):
+    def __init__(self, parameters: Optional[Iterable["Parameter"]] = None) -> None:
+        super(ParameterList, self).__init__()
+        self._initialized = True
+        if parameters is not None:
+            self += parameters
+
+    def __setstate__(self, state):
+        state["_initialized"] = False
+        super(ParameterList, self).__setstate__(state)
+        self._initialized = True
+
+    def _get_abs_string_index(self, idx):
+        """Get the absolute index for the list of modules"""
+        idx = operator.index(idx)
+        if not -len(self) <= idx < len(self):
+            raise IndexError("index {} is out of range".format(idx))
+        if idx < 0:
+            idx += len(self)
+        return str(idx)
+
+    @overload
+    def __getitem__(self, idx: int) -> "Parameter":
+        ...
+
+    @overload
+    def __getitem__(self: T, idx: slice) -> T:
+        ...
+
+    def __getitem__(self, idx):
+        if isinstance(idx, slice):
+            return self.__class__(list(self._parameters.values())[idx])
+        else:
+            idx = self._get_abs_string_index(idx)
+            return self._parameters[str(idx)]
+
+    def __setitem__(self, idx: int, param: "Parameter") -> None:
+        idx = self._get_abs_string_index(idx)
+        return self.register_parameter(str(idx), param)
+
+    def __setattr__(self, key: Any, value: Any) -> None:
+        if getattr(self, "_initialized", False):
+            if not hasattr(self, key) and (not isinstance(value, flow.nn.Parameter)):
+                warnings.warn("Setting attributes on ParameterList is not supported.")
+        super(ParameterList, self).__setattr__(key, value)
+
+    def __len__(self) -> int:
+        return len(self._parameters)
+
+    def __iter__(self) -> Iterator["Parameter"]:
+        return iter(self._parameters.values())
+
+    def __iadd__(self: T, parameters: Iterable["Parameter"]) -> T:
+        return self.extend(parameters)
+
+    def __dir__(self):
+        keys = super(ParameterList, self).__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    def append(self: T, parameter: "Parameter") -> T:
+        """Appends a given parameter at the end of the list.
+
+        Arguments:
+            parameter (nn.Parameter): parameter to append
+        """
+        self.register_parameter(str(len(self)), parameter)
+        return self
+
+    def extend(self: T, parameters: Iterable["Parameter"]) -> T:
+        """Appends parameters from a Python iterable to the end of the list.
+
+        Arguments:
+            parameters (iterable): iterable of parameters to append
+        """
+        if not isinstance(parameters, collections.abc.Iterable):
+            raise TypeError(
+                "ParameterList.extend should be called with an iterable, but got "
+                + type(parameters).__name__
+            )
+        offset = len(self)
+        for (i, param) in enumerate(parameters):
+            self.register_parameter(str(offset + i), param)
+        return self
+
+    def extra_repr(self) -> str:
+        child_lines = []
+        for (k, p) in self._parameters.items():
+            size_str = "x".join((str(size) for size in p.size()))
+            device_str = "" if not p.is_cuda else " (GPU {})".format(p.get_device())
+            parastr = "Parameter containing: [{} of size {}{}]".format(
+                type(p), size_str, device_str
+            )
+            child_lines.append("  (" + str(k) + "): " + parastr)
+        tmpstr = "\n".join(child_lines)
+        return tmpstr
+
+    def __call__(self, input):
+        raise RuntimeError("ParameterList should not be called.")
+
+    def _replicate_for_data_parallel(self):
+        warnings.warn(
+            "nn.ParameterList is being used with DataParallel but this is not supported. This list will appear empty for the models replicated on each GPU except the original one."
+        )
+        return super(ParameterList, self)._replicate_for_data_parallel()
+
+
+class ParameterDict(Module):
+    def __init__(self, parameters: Optional[Mapping[str, "Parameter"]] = None) -> None:
+        super(ParameterDict, self).__init__()
+        self._initialized = True
+        if parameters is not None:
+            self.update(parameters)
+
+    def __setstate__(self, state):
+        state["_initialized"] = False
+        super(ParameterDict, self).__setstate__(state)
+        self._initialized = True
+
+    def __getitem__(self, key: str) -> "Parameter":
+        return self._parameters[key]
+
+    def __setitem__(self, key: str, parameter: "Parameter") -> None:
+        self.register_parameter(key, parameter)
+
+    def __delitem__(self, key: str) -> None:
+        del self._parameters[key]
+
+    def __setattr__(self, key: Any, value: Any) -> None:
+        if getattr(self, "_initialized", False):
+            if not hasattr(self, key) and (not isinstance(value, flow.nn.Parameter)):
+                warnings.warn("Setting attributes on ParameterDict is not supported.")
+        super(ParameterDict, self).__setattr__(key, value)
+
+    def __len__(self) -> int:
+        return len(self._parameters)
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._parameters.keys())
+
+    def __contains__(self, key: str) -> bool:
+        return key in self._parameters
+
+    def clear(self) -> None:
+        """Remove all items from the ParameterDict.
+        """
+        self._parameters.clear()
+
+
+class ModuleList(Module):
+    def __init__(self, modules: Optional[Iterable[Module]] = None) -> None:
+        super(ModuleList, self).__init__()
+        if modules is not None:
+            self += modules
+
+    def _get_abs_string_index(self, idx):
+        """Get the absolute index for the list of modules"""
+        idx = operator.index(idx)
+        if not -len(self) <= idx < len(self):
+            raise IndexError("index {} is out of range".format(idx))
+        if idx < 0:
+            idx += len(self)
+        return str(idx)
+
+    def __getitem__(self, idx: int) -> Module:
+        if isinstance(idx, slice):
+            return self.__class__(list(self._modules.values())[idx])
+        else:
+            return self._modules[self._get_abs_string_index(idx)]
+
+    def __setitem__(self, idx: int, module: Module) -> None:
+        idx = self._get_abs_string_index(idx)
+        return setattr(self, str(idx), module)
+
+    def __delitem__(self, idx: Union[int, slice]) -> None:
+        if isinstance(idx, slice):
+            for k in range(len(self._modules))[idx]:
+                delattr(self, str(k))
+        else:
+            delattr(self, self._get_abs_string_index(idx))
+        str_indices = [str(i) for i in range(len(self._modules))]
+        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))
+
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    def __iter__(self) -> Iterator[Module]:
+        return iter(self._modules.values())
+
+    def __iadd__(self: T, modules: Iterable[Module]) -> T:
+        return self.extend(modules)
+
+    def __dir__(self):
+        keys = super(ModuleList, self).__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    def insert(self, index: int, module: Module) -> None:
+        """Insert a given module before a given index in the list.
+
+        Arguments:
+            index (int): index to insert.
+            module (nn.Module): module to insert
+        """
+        for i in range(len(self._modules), index, -1):
+            self._modules[str(i)] = self._modules[str(i - 1)]
+        self._modules[str(index)] = module
+
+    def append(self: T, module: Module) -> T:
+        """Appends a given module to the end of the list.
+
+        Arguments:
+            module (nn.Module): module to append
+        """
+        self.add_module(str(len(self)), module)
+        return self
+
+    def extend(self: T, modules: Iterable[Module]) -> T:
+        """Appends modules from a Python iterable to the end of the list.
+
+        Arguments:
+            modules (iterable): iterable of modules to append
+        """
+        if not isinstance(modules, collections.abc.Iterable):
+            raise TypeError(
+                "ModuleList.extend should be called with an iterable, but got "
+                + type(modules).__name__
+            )
+        offset = len(self)
+        for (i, module) in enumerate(modules):
+            self.add_module(str(offset + i), module)
+        return self
+
+    def forward(self):
+        raise NotImplementedError()
+
+
+class ModuleDict(Module):
+    def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
+        super(ModuleDict, self).__init__()
+        if modules is not None:
+            self.update(modules)
+
+    def __getitem__(self, key: str) -> Module:
+        return self._modules[key]
+
+    def __setitem__(self, key: str, module: Module) -> None:
+        self.add_module(key, module)
+
+    def __delitem__(self, key: str) -> None:
+        del self._modules[key]
+
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._modules)
+
+    def __contains__(self, key: str) -> bool:
+        return key in self._modules
+
+    def clear(self) -> None:
+        """Remove all items from the ModuleDict.
+        """
+        self._modules.clear()
+
+    def pop(self, key: str) -> Module:
+        """Remove key from the ModuleDict and return its module.
+
+        Arguments:
+            key (string): key to pop from the ModuleDict
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    def keys(self) -> Iterable[str]:
+        """Return an iterable of the ModuleDict keys.
+        """
+        return self._modules.keys()
+
+    def items(self) -> Iterable[Tuple[str, Module]]:
+        """Return an iterable of the ModuleDict key/value pairs.
+        """
+        return self._modules.items()
+
+    def values(self) -> Iterable[Module]:
+        """Return an iterable of the ModuleDict values.
+        """
+        return self._modules.values()
+
+    def update(self, modules: Mapping[str, Module]) -> None:
+        if not isinstance(modules, collections.abc.Iterable):
+            raise TypeError(
+                "ModuleDict.update should be called with an iterable of key/value pairs, but got "
+                + type(modules).__name__
+            )
+        if isinstance(modules, (OrderedDict, ModuleDict, collections.abc.Mapping)):
+            for (key, module) in modules.items():
+                self[key] = module
+        else:
+            for (j, m) in enumerate(modules):
+                if not isinstance(m, collections.abc.Iterable):
+                    raise TypeError(
+                        "ModuleDict update sequence element #"
+                        + str(j)
+                        + " should be Iterable; is"
+                        + type(m).__name__
+                    )
+                if not len(m) == 2:
+                    raise ValueError(
+                        "ModuleDict update sequence element #"
+                        + str(j)
+                        + " has length "
+                        + str(len(m))
+                        + "; 2 is required"
+                    )
+                self[m[0]] = m[1]
+
+    def forward(self):
+        raise NotImplementedError()
+
+    def pop(self, key: str) -> "Parameter":
+        """Remove key from the ParameterDict and return its parameter.
+
+        Arguments:
+            key (string): key to pop from the ParameterDict
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    def keys(self) -> Iterable[str]:
+        """Return an iterable of the ParameterDict keys.
+        """
+        return self._parameters.keys()
+
+    def items(self) -> Iterable[Tuple[str, "Parameter"]]:
+        """Return an iterable of the ParameterDict key/value pairs.
+        """
+        return self._parameters.items()
+
+    def values(self) -> Iterable["Parameter"]:
+        """Return an iterable of the ParameterDict values.
+        """
+        return self._parameters.values()
+
+    def update(self, parameters: Mapping[str, "Parameter"]) -> None:
+        if not isinstance(parameters, collections.abc.Iterable):
+            raise TypeError(
+                "ParametersDict.update should be called with an iterable of key/value pairs, but got "
+                + type(parameters).__name__
+            )
+        if isinstance(parameters, (OrderedDict, ParameterDict)):
+            for (key, parameter) in parameters.items():
+                self[key] = parameter
+        elif isinstance(parameters, collections.abc.Mapping):
+            for (key, parameter) in sorted(parameters.items()):
+                self[key] = parameter
+        else:
+            for (j, p) in enumerate(parameters):
+                if not isinstance(p, collections.abc.Iterable):
+                    raise TypeError(
+                        "ParameterDict update sequence element #"
+                        + str(j)
+                        + " should be Iterable; is"
+                        + type(p).__name__
+                    )
+                if not len(p) == 2:
+                    raise ValueError(
+                        "ParameterDict update sequence element #"
+                        + str(j)
+                        + " has length "
+                        + str(len(p))
+                        + "; 2 is required"
+                    )
+                self[p[0]] = p[1]
+
+    def extra_repr(self) -> str:
+        child_lines = []
+        for (k, p) in self._parameters.items():
+            size_str = "x".join((str(size) for size in p.size()))
+            device_str = "" if not p.is_cuda else " (GPU {})".format(p.get_device())
+            parastr = "Parameter containing: [{} of size {}{}]".format(
+                type(p), size_str, device_str
+            )
+            child_lines.append("  (" + k + "): " + parastr)
+        tmpstr = "\n".join(child_lines)
+        return tmpstr
+
+    def __call__(self, input):
+        raise RuntimeError("ParameterDict should not be called.")
+
+    def _replicate_for_data_parallel(self):
+        warnings.warn(
+            "nn.ParameterDict is being used with DataParallel but this is not supported. This dict will appear empty for the models replicated on each GPU except the original one."
+        )
+        return super(ParameterDict, self)._replicate_for_data_parallel()
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/conv.py b/python/oneflow/compatible/single_client/nn/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce755e02ac7149c62a5a96f8fbadf94a0facff65
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/conv.py
@@ -0,0 +1,461 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn import init
+from oneflow.compatible.single_client.nn.common_types import _size_1_t, _size_2_t
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import _pair, _single
+
+
+def slice(x, begin, size):
+    ndim = len(x.shape)
+    if not isinstance(begin, (list, tuple)) or len(begin) != ndim:
+        raise ValueError(
+            "begin must be a list/tuple with the same length as input tensor's number of dimensions"
+        )
+    if not all((isinstance(b, int) or b is None for b in begin)):
+        raise ValueError("element of begin must be a int or None")
+    if not isinstance(size, (list, tuple)) or len(size) != ndim:
+        raise ValueError(
+            "size must be a list/tuple with the same length as input tensor's number of dimensions."
+        )
+    if not all((isinstance(s, int) or s is None for s in size)):
+        raise ValueError("element of size must be a int or None")
+    slice_tup_list = []
+    for (b, s, dim_size) in zip(begin, size, x.shape):
+        (start, stop, step) = (None, None, 1)
+        if b is not None:
+            if b < -dim_size or b >= dim_size:
+                raise ValueError("element of begin is out of range")
+            start = b
+        if s is not None:
+            if s == -1:
+                stop = dim_size
+            else:
+                if s <= 0 or s > dim_size:
+                    raise ValueError("element of size is invalid")
+                if b + s < dim_size:
+                    stop = b + s
+        slice_tup_list.append((start, stop, step))
+    return flow.experimental.slice(x, slice_tup_list)
+
+
+class ConvUtil(object):
+    @classmethod
+    def split(cls, x, axis, split_num):
+        split_len = x.shape[axis] // split_num
+        result_list = []
+        slice_begin = [0] * len(x.shape)
+        slice_size = [-1] * len(x.shape)
+        slice_size[axis] = split_len
+        for i in range(split_num):
+            slice_begin[axis] = i * split_len
+            result = slice(x, slice_begin, slice_size)
+            result_list.append(result)
+        return result_list
+
+
+class Conv1d(Module):
+    """The interface is consistent with PyTorch.    
+    The documentation is referenced from: https://pytorch.org/docs/master/generated/torch.nn.Conv1d.html#conv1d
+    
+    Applies a 1D convolution over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size
+    :math:`(N, C_{\\text{in}}, L)` and output :math:`(N, C_{\\text{out}}, L_{\\text{out}})` can be
+    precisely described as:
+
+    .. math::
+        \\text{out}(N_i, C_{\\text{out}_j}) = \\text{bias}(C_{\\text{out}_j}) +
+        \\sum_{k = 0}^{C_{in} - 1} \\text{weight}(C_{\\text{out}_j}, k)
+        \\star \\text{input}(N_i, k)
+
+    where :math:`\\star` is the valid `cross-correlation`_ operator,
+    :math:`N` is a batch size, :math:`C` denotes a number of channels,
+    :math:`L` is a length of signal sequence.
+
+    * :attr:`stride` controls the stride for the cross-correlation, a single
+      number or a one-element tuple.
+
+    * :attr:`padding` controls the amount of padding applied to the input. It
+      can be either a string {{'valid', 'same'}} or a tuple of ints giving the
+      amount of implicit padding applied on both sides.
+
+    * :attr:`dilation` controls the spacing between the kernel points; also
+      known as the 脿 trous algorithm. It is harder to describe, but this `link`_
+      has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        ``padding='valid'`` is the same as no padding. ``padding='same'`` pads
+        the input so the output has the shape as the input. However, this mode
+        doesn't support any stride values other than 1.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to both sides of
+            the input. Default: 0
+        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C_{in}, L_{in})`
+        - Output: :math:`(N, C_{out}, L_{out})` where
+
+          .. math::
+              L_{out} = \\left\\lfloor\\frac{L_{in} + 2 \\times \\text{padding} - \\text{dilation}
+                        \\times (\\text{kernel\\_size} - 1) - 1}{\\text{stride}} + 1\\right\\rfloor
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\\text{out\\_channels},
+            \\frac{\\text{in\\_channels}}{\\text{groups}}, \\text{kernel\\_size})`.
+            The values of these weights are sampled from
+            :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\text{kernel\\_size}}`
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels). If :attr:`bias` is ``True``, then the values of these weights are
+            sampled from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\text{kernel\\_size}}`
+
+    For example: 
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import oneflow.compatible.single_client.experimental.nn as nn
+        >>> flow.enable_eager_execution()
+
+        >>> arr = np.random.randn(20, 16, 50)
+        >>> input = flow.Tensor(arr)
+        >>> m = nn.Conv1d(16, 33, 3, stride=2)
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+    ):
+        super().__init__()
+        assert padding_mode == "zeros"
+        self.kernel_size = _single(kernel_size)
+        self.stride = _single(stride)
+        self.padding = _single(padding)
+        self.dilation = _single(dilation)
+        self.groups = groups
+        assert in_channels % groups == 0
+        assert out_channels % groups == 0
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.weight = flow.nn.Parameter(
+            flow.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        self.out_channel_groups = out_channels // groups
+        self.bias = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_channels))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        if x.device.type == "cpu" and self.groups > 1:
+            in_channel_axis = 1
+            weight_channel_axis = 0
+            bias_channel_axis = 0
+            in_split_list = ConvUtil.split(
+                x, axis=in_channel_axis, split_num=self.groups
+            )
+            out_list = []
+            for i in range(len(in_split_list)):
+                out_list.append(
+                    flow.F.conv1d(
+                        in_split_list[i],
+                        self.weight[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups,
+                            :,
+                            :,
+                        ],
+                        self.bias[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups
+                        ]
+                        if self.bias
+                        else None,
+                        stride=self.stride,
+                        padding=self.padding,
+                        dilation=self.dilation,
+                        groups=1,
+                    )
+                )
+            res = flow.experimental.cat(out_list, dim=in_channel_axis)
+        else:
+            res = flow.F.conv1d(
+                x,
+                self.weight,
+                self.bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+            )
+        return res
+
+
+class Conv2d(Module):
+    """The interface is consistent with PyTorch.    
+    The documentation is referenced from: https://pytorch.org/docs/master/generated/torch.nn.Conv2d.html#conv2d
+    
+    Applies a 2D convolution over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size
+    :math:`(N, C_{\\text{in}}, H, W)` and output :math:`(N, C_{\\text{out}}, H_{\\text{out}}, W_{\\text{out}})`
+    can be precisely described as:
+
+    .. math::
+        \\text{out}(N_i, C_{\\text{out}_j}) = \\text{bias}(C_{\\text{out}_j}) +
+        \\sum_{k = 0}^{C_{\\text{in}} - 1} \\text{weight}(C_{\\text{out}_j}, k) \\star \\text{input}(N_i, k)
+
+
+    where :math:`\\star` is the valid 2D `cross-correlation`_ operator,
+    :math:`N` is a batch size, :math:`C` denotes a number of channels,
+    :math:`H` is a height of input planes in pixels, and :math:`W` is
+    width in pixels.
+
+
+    * :attr:`stride` controls the stride for the cross-correlation, a single
+      number or a tuple.
+    * :attr:`padding` controls the amount of implicit padding on both
+      sides for :attr:`padding` number of points for each dimension.
+    * :attr:`dilation` controls the spacing between the kernel points; also
+      known as the 脿 trous algorithm. It is harder to describe, but this `link`_
+      has a nice visualization of what :attr:`dilation` does.
+    * :attr:`groups` controls the connections between inputs and outputs.
+      :attr:`in_channels` and :attr:`out_channels` must both be divisible by
+      :attr:`groups`. For example,
+
+        * At groups=1, all inputs are convolved to all outputs.
+        * At groups=2, the operation becomes equivalent to having two conv
+          layers side by side, each seeing half the input channels
+          and producing half the output channels, and both subsequently
+          concatenated.
+        * At groups= :attr:`in_channels`, each input channel is convolved with
+          its own set of filters (of size
+          :math:`\\frac{\\text{out_channels}}{\\text{in_channels}}`).,
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Note:
+        When `groups == in_channels` and `out_channels == K * in_channels`,
+        where `K` is a positive integer, this operation is also known as a "depthwise convolution".
+
+        In other words, for an input of size :math:`(N, C_{in}, L_{in})`,
+        a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments
+        :math:`(C_\\text{in}=C_\\text{in}, C_\\text{out}=C_\\text{in} \\times \\text{K}, ..., \\text{groups}=C_\\text{in})`.
+
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in}  + 2 \\times \\text{padding}[0] - \\text{dilation}[0]
+                        \\times (\\text{kernel_size}[0] - 1) - 1}{\\text{stride}[0]} + 1\\right\\rfloor
+
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in}  + 2 \\times \\text{padding}[1] - \\text{dilation}[1]
+                        \\times (\\text{kernel_size}[1] - 1) - 1}{\\text{stride}[1]} + 1\\right\\rfloor
+
+    Attr:
+        - weight (Tensor): the learnable weights of the module of shape
+            :math:`(\\text{out_channels}, \\frac{\\text{in_channels}}{\\text{groups}},`
+            :math:`\\text{kernel_size[0]}, \\text{kernel_size[1]})`.
+            The values of these weights are sampled from
+            :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+
+        - bias (Tensor):   the learnable bias of the module of shape
+            (out_channels). If :attr:`bias` is ``True``,
+            then the values of these weights are
+            sampled from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+
+    For example: 
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import oneflow.compatible.single_client.experimental.nn as nn
+        >>> flow.enable_eager_execution()
+
+        >>> arr = np.random.randn(20, 16, 50, 100)
+        >>> input = flow.Tensor(arr)
+        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+    ):
+        super().__init__()
+        assert padding_mode == "zeros"
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        assert in_channels % groups == 0
+        assert out_channels % groups == 0
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.weight = flow.nn.Parameter(
+            flow.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        self.out_channel_groups = out_channels // groups
+        self.bias = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_channels))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        if x.shape[1] != self.in_channels:
+            raise ValueError("The input channels should be equal to self.in_channels")
+        if x.device.type == "cpu" and self.groups > 1:
+            in_channel_axis = 1
+            in_split_list = ConvUtil.split(
+                x, axis=in_channel_axis, split_num=self.groups
+            )
+            out_list = []
+            for i in range(len(in_split_list)):
+                out_list.append(
+                    flow.F.conv2d(
+                        in_split_list[i],
+                        self.weight[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups,
+                            :,
+                            :,
+                            :,
+                        ],
+                        self.bias[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups
+                        ]
+                        if self.bias
+                        else None,
+                        stride=self.stride,
+                        padding=self.padding,
+                        dilation=self.dilation,
+                        groups=1,
+                    )
+                )
+            res = flow.experimental.cat(out_list, dim=in_channel_axis)
+        else:
+            res = flow.F.conv2d(
+                x,
+                self.weight,
+                self.bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+            )
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/dataset.py b/python/oneflow/compatible/single_client/nn/modules/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..23fd784f9a8146651cdff2d909bfc3acdd4deaae
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/dataset.py
@@ -0,0 +1,526 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+import traceback
+from typing import List, Optional, Sequence, Tuple, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.common_types import (
+    _size_1_t,
+    _size_2_t,
+    _size_3_t,
+    _size_any_t,
+)
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import (
+    _pair,
+    _reverse_repeat_tuple,
+    _single,
+    _triple,
+)
+
+
+def mirrored_gen_random_seed(seed=None):
+    if seed is None:
+        seed = -1
+        has_seed = False
+    else:
+        has_seed = True
+    return (seed, has_seed)
+
+
+class OfrecordReader(Module):
+    def __init__(
+        self,
+        ofrecord_dir: str,
+        batch_size: int = 1,
+        data_part_num: int = 1,
+        part_name_prefix: str = "part-",
+        part_name_suffix_length: int = -1,
+        random_shuffle: bool = False,
+        shuffle_buffer_size: int = 1024,
+        shuffle_after_epoch: bool = False,
+        random_seed: int = -1,
+        name: Optional[str] = None,
+    ):
+        super().__init__()
+        (seed, has_seed) = mirrored_gen_random_seed(random_seed)
+        self._op = (
+            flow.builtin_op("OFRecordReader", name)
+            .Output("out")
+            .Attr("data_dir", ofrecord_dir)
+            .Attr("data_part_num", data_part_num)
+            .Attr("batch_size", batch_size)
+            .Attr("part_name_prefix", part_name_prefix)
+            .Attr("random_shuffle", random_shuffle)
+            .Attr("shuffle_buffer_size", shuffle_buffer_size)
+            .Attr("shuffle_after_epoch", shuffle_after_epoch)
+            .Attr("part_name_suffix_length", part_name_suffix_length)
+            .Attr("seed", seed)
+            .Build()
+        )
+
+    def forward(self):
+        res = self._op()[0]
+        return res
+
+
+class OfrecordRawDecoder(Module):
+    def __init__(
+        self,
+        blob_name: str,
+        shape: Sequence[int],
+        dtype: flow.dtype,
+        dim1_varying_length: bool = False,
+        truncate: bool = False,
+        auto_zero_padding: bool = False,
+        name: Optional[str] = None,
+    ):
+        super().__init__()
+        if auto_zero_padding:
+            print(
+                "WARNING: auto_zero_padding has been deprecated, Please use truncate instead.\n                "
+            )
+        self._op = (
+            flow.builtin_op("ofrecord_raw_decoder", name)
+            .Input("in")
+            .Output("out")
+            .Attr("name", blob_name)
+            .Attr("shape", shape)
+            .Attr("data_type", dtype)
+            .Attr("dim1_varying_length", dim1_varying_length)
+            .Attr("truncate", truncate or auto_zero_padding)
+            .Build()
+        )
+
+    def forward(self, input):
+        res = self._op(input)[0]
+        return res
+
+
+class CoinFlip(Module):
+    def __init__(
+        self,
+        batch_size: int = 1,
+        random_seed: Optional[int] = None,
+        probability: float = 0.5,
+    ):
+        super().__init__()
+        (seed, has_seed) = mirrored_gen_random_seed(random_seed)
+        self._op = (
+            flow.builtin_op("coin_flip")
+            .Output("out")
+            .Attr("batch_size", batch_size)
+            .Attr("probability", probability)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .Build()
+        )
+
+    def forward(self):
+        res = self._op()[0]
+        return res
+
+
+class CropMirrorNormalize(Module):
+    def __init__(
+        self,
+        color_space: str = "BGR",
+        output_layout: str = "NCHW",
+        crop_h: int = 0,
+        crop_w: int = 0,
+        crop_pos_y: float = 0.5,
+        crop_pos_x: float = 0.5,
+        mean: Sequence[float] = [0.0],
+        std: Sequence[float] = [1.0],
+        output_dtype: flow.dtype = flow.float,
+    ):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("crop_mirror_normalize_from_uint8")
+            .Input("in")
+            .Input("mirror")
+            .Output("out")
+            .Attr("color_space", color_space)
+            .Attr("output_layout", output_layout)
+            .Attr("mean", mean)
+            .Attr("std", std)
+            .Attr("crop_h", crop_h)
+            .Attr("crop_w", crop_w)
+            .Attr("crop_pos_y", crop_pos_y)
+            .Attr("crop_pos_x", crop_pos_x)
+            .Attr("output_dtype", output_dtype)
+            .Build()
+        )
+        self._val_op = (
+            flow.builtin_op("crop_mirror_normalize_from_tensorbuffer")
+            .Input("in")
+            .Output("out")
+            .Attr("color_space", color_space)
+            .Attr("output_layout", output_layout)
+            .Attr("mean", mean)
+            .Attr("std", std)
+            .Attr("crop_h", crop_h)
+            .Attr("crop_w", crop_w)
+            .Attr("crop_pos_y", crop_pos_y)
+            .Attr("crop_pos_x", crop_pos_x)
+            .Attr("output_dtype", output_dtype)
+            .Build()
+        )
+
+    def forward(self, input, mirror=None):
+        if mirror != None:
+            res = self._op(input, mirror)[0]
+        else:
+            res = self._val_op(input)[0]
+        return res
+
+
+class OFRecordImageDecoderRandomCrop(Module):
+    def __init__(
+        self,
+        blob_name: str,
+        color_space: str = "BGR",
+        num_attempts: int = 10,
+        random_seed: Optional[int] = None,
+        random_area: Sequence[float] = [0.08, 1.0],
+        random_aspect_ratio: Sequence[float] = [0.75, 1.333333],
+    ):
+        super().__init__()
+        (seed, has_seed) = mirrored_gen_random_seed(random_seed)
+        self._op = (
+            flow.builtin_op("ofrecord_image_decoder_random_crop")
+            .Input("in")
+            .Output("out")
+            .Attr("name", blob_name)
+            .Attr("color_space", color_space)
+            .Attr("num_attempts", num_attempts)
+            .Attr("random_area", random_area)
+            .Attr("random_aspect_ratio", random_aspect_ratio)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .Build()
+        )
+
+    def forward(self, input):
+        res = self._op(input)[0]
+        return res
+
+
+class OFRecordImageDecoder(Module):
+    def __init__(self, blob_name: str, color_space: str = "BGR"):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("ofrecord_image_decoder")
+            .Input("in")
+            .Output("out")
+            .Attr("name", blob_name)
+            .Attr("color_space", color_space)
+            .Build()
+        )
+
+    def forward(self, input):
+        res = self._op(input)[0]
+        return res
+
+
+class TensorBufferToListOfTensors(Module):
+    def __init__(
+        self, out_shapes, out_dtypes, out_num: int = 1, dynamic_out: bool = False
+    ):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("tensor_buffer_to_list_of_tensors_v2")
+            .Input("in")
+            .Output("out", out_num)
+            .Attr("out_shapes", out_shapes)
+            .Attr("out_dtypes", out_dtypes)
+            .Attr("dynamic_out", dynamic_out)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)
+
+
+def tensor_buffer_to_list_of_tensors(tensor, out_shapes, out_dtypes):
+    return TensorBufferToListOfTensors(
+        [list(out_shape) for out_shape in out_shapes], out_dtypes, len(out_shapes)
+    )(tensor)
+
+
+class ImageResize(Module):
+    def __init__(
+        self,
+        target_size: Union[int, Sequence[int]] = None,
+        min_size: Optional[int] = None,
+        max_size: Optional[int] = None,
+        keep_aspect_ratio: bool = False,
+        resize_side: str = "shorter",
+        channels: int = 3,
+        dtype: Optional[flow.dtype] = None,
+        interpolation_type: str = "auto",
+        name: Optional[str] = None,
+        color_space: Optional[str] = None,
+        interp_type: Optional[str] = None,
+        resize_shorter: int = 0,
+        resize_x: int = 0,
+        resize_y: int = 0,
+    ):
+        super().__init__()
+        deprecated_param_used = False
+        if color_space is not None:
+            print(
+                "WARNING: color_space has been deprecated. Please use channels instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            assert isinstance(color_space, str)
+            if color_space.upper() == "RGB" or color_space.upper() == "BGR":
+                channels = 3
+            elif color_space.upper() == "GRAY":
+                channels = 1
+            else:
+                raise ValueError("invalid color_space")
+        if interp_type is not None:
+            print(
+                "WARNING: interp_type has been deprecated. Please use interpolation_type instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            assert isinstance(interp_type, str)
+            if interp_type == "Linear":
+                interpolation_type = "bilinear"
+            elif interp_type == "NN":
+                interpolation_type = "nearest_neighbor"
+            elif interp_type == "Cubic":
+                interpolation_type = "bicubic"
+            else:
+                raise ValueError("invalid interp_type")
+        if resize_x > 0 and resize_y > 0:
+            print(
+                "WARNING: resize_x and resize_y has been deprecated. Please use target_size instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            target_size = (resize_x, resize_y)
+            keep_aspect_ratio = False
+        if resize_shorter > 0:
+            print(
+                "WARNING: resize_shorter has been deprecated. Please use target_size instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            target_size = resize_shorter
+            keep_aspect_ratio = True
+            resize_side = "shorter"
+        if keep_aspect_ratio:
+            if not isinstance(target_size, int):
+                raise ValueError(
+                    "target_size must be an int when keep_aspect_ratio is True"
+                )
+            if min_size is None:
+                min_size = 0
+            if max_size is None:
+                max_size = 0
+            if resize_side == "shorter":
+                resize_longer = False
+            elif resize_side == "longer":
+                resize_longer = True
+            else:
+                raise ValueError('resize_side must be "shorter" or "longer"')
+            self._op = (
+                flow.builtin_op("image_resize_keep_aspect_ratio")
+                .Input("in")
+                .Output("out")
+                .Output("size")
+                .Output("scale")
+                .Attr("target_size", target_size)
+                .Attr("min_size", min_size)
+                .Attr("max_size", max_size)
+                .Attr("resize_longer", resize_longer)
+                .Attr("interpolation_type", interpolation_type)
+                .Build()
+            )
+        else:
+            if (
+                not isinstance(target_size, (list, tuple))
+                or len(target_size) != 2
+                or (not all((isinstance(size, int) for size in target_size)))
+            ):
+                raise ValueError(
+                    "target_size must be a form like (width, height) when keep_aspect_ratio is False"
+                )
+            if dtype is None:
+                dtype = flow.uint8
+            (target_w, target_h) = target_size
+            self._op = (
+                flow.builtin_op("image_resize_to_fixed")
+                .Input("in")
+                .Output("out")
+                .Output("scale")
+                .Attr("target_width", target_w)
+                .Attr("target_height", target_h)
+                .Attr("channels", channels)
+                .Attr("data_type", dtype)
+                .Attr("interpolation_type", interpolation_type)
+                .Build()
+            )
+
+    def forward(self, input):
+        res = self._op(input)[0]
+        return res
+
+
+def raw_decoder(
+    input_record,
+    blob_name: str,
+    shape: Sequence[int],
+    dtype: flow.dtype,
+    dim1_varying_length: bool = False,
+    truncate: bool = False,
+    auto_zero_padding: bool = False,
+    name: Optional[str] = None,
+):
+    if auto_zero_padding:
+        print(
+            "WARNING: auto_zero_padding has been deprecated, Please use truncate instead.\n            "
+        )
+    return OfrecordRawDecoder(
+        blob_name,
+        shape,
+        dtype,
+        dim1_varying_length,
+        truncate or auto_zero_padding,
+        name,
+    ).forward(input_record)
+
+
+def get_ofrecord_handle(
+    ofrecord_dir: str,
+    batch_size: int = 1,
+    data_part_num: int = 1,
+    part_name_prefix: str = "part-",
+    part_name_suffix_length: int = -1,
+    random_shuffle: bool = False,
+    shuffle_buffer_size: int = 1024,
+    shuffle_after_epoch: bool = False,
+    name: Optional[str] = None,
+):
+    return OfrecordReader(
+        ofrecord_dir,
+        batch_size,
+        data_part_num,
+        part_name_prefix,
+        part_name_suffix_length,
+        random_shuffle,
+        shuffle_buffer_size,
+        shuffle_after_epoch,
+        name,
+    )()
+
+
+class ImageDecode(Module):
+    def __init__(self, dtype: flow.dtype = flow.uint8, color_space: str = "BGR"):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("image_decode")
+            .Input("in")
+            .Output("out")
+            .Attr("color_space", color_space)
+            .Attr("data_type", dtype)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+class ImageNormalize(Module):
+    def __init__(self, std: Sequence[float], mean: Sequence[float]):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("image_normalize")
+            .Input("in")
+            .Output("out")
+            .Attr("std", std)
+            .Attr("mean", mean)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+class COCOReader(Module):
+    def __init__(
+        self,
+        annotation_file: str,
+        image_dir: str,
+        batch_size: int,
+        shuffle: bool = True,
+        random_seed: Optional[int] = None,
+        group_by_aspect_ratio: bool = True,
+        remove_images_without_annotations: bool = True,
+        stride_partition: bool = True,
+    ):
+        super().__init__()
+        if random_seed is None:
+            random_seed = random.randrange(sys.maxsize)
+        self._op = (
+            flow.builtin_op("COCOReader")
+            .Output("image")
+            .Output("image_id")
+            .Output("image_size")
+            .Output("gt_bbox")
+            .Output("gt_label")
+            .Output("gt_segm")
+            .Output("gt_segm_index")
+            .Attr("session_id", flow.current_scope().session_id)
+            .Attr("annotation_file", annotation_file)
+            .Attr("image_dir", image_dir)
+            .Attr("batch_size", batch_size)
+            .Attr("shuffle_after_epoch", shuffle)
+            .Attr("random_seed", random_seed)
+            .Attr("group_by_ratio", group_by_aspect_ratio)
+            .Attr(
+                "remove_images_without_annotations", remove_images_without_annotations
+            )
+            .Attr("stride_partition", stride_partition)
+            .Build()
+        )
+
+    def forward(self):
+        res = self._op()
+        return res
+
+
+class ImageBatchAlign(Module):
+    def __init__(self, shape: Sequence[int], dtype: flow.dtype, alignment: int):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("image_batch_align")
+            .Input("in")
+            .Output("out")
+            .Attr("shape", shape)
+            .Attr("data_type", dtype)
+            .Attr("alignment", alignment)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
diff --git a/python/oneflow/compatible/single_client/nn/modules/deconv.py b/python/oneflow/compatible/single_client/nn/modules/deconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..15c8dc81a175d746756647fd6ec164d425b2af3a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/deconv.py
@@ -0,0 +1,238 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn import init
+from oneflow.compatible.single_client.nn.common_types import _size_2_t
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import _pair
+
+
+def slice(x, begin, size):
+    ndim = len(x.shape)
+    if not isinstance(begin, (list, tuple)) or len(begin) != ndim:
+        raise ValueError(
+            "begin must be a list/tuple with the same length as input tensor's number of dimensions"
+        )
+    if not all((isinstance(b, int) or b is None for b in begin)):
+        raise ValueError("element of begin must be a int or None")
+    if not isinstance(size, (list, tuple)) or len(size) != ndim:
+        raise ValueError(
+            "size must be a list/tuple with the same length as input tensor's number of dimensions."
+        )
+    if not all((isinstance(s, int) or s is None for s in size)):
+        raise ValueError("element of size must be a int or None")
+    slice_tup_list = []
+    for (b, s, dim_size) in zip(begin, size, x.shape):
+        (start, stop, step) = (None, None, 1)
+        if b is not None:
+            if b < -dim_size or b >= dim_size:
+                raise ValueError("element of begin is out of range")
+            start = b
+        if s is not None:
+            if s == -1:
+                stop = dim_size
+            else:
+                if s <= 0 or s > dim_size:
+                    raise ValueError("element of size is invalid")
+                if b + s < dim_size:
+                    stop = b + s
+        slice_tup_list.append((start, stop, step))
+    return flow.experimental.slice(x, slice_tup_list)
+
+
+class ConvUtil(object):
+    @classmethod
+    def split(cls, x, axis, split_num):
+        split_len = x.shape[axis] // split_num
+        result_list = []
+        slice_begin = [0] * len(x.shape)
+        slice_size = [-1] * len(x.shape)
+        slice_size[axis] = split_len
+        for i in range(split_num):
+            slice_begin[axis] = i * split_len
+            result = slice(x, slice_begin, slice_size)
+            result_list.append(result)
+        return result_list
+
+
+class ConvTranspose2d(Module):
+    """
+    
+    Applies a 2D transposed convolution operator over an input image composed of several input planes.
+
+    This module can be seen as the gradient of Conv2d with respect to its input.
+    It is also known as a fractionally-strided convolution or
+    a deconvolution (although it is not an actual deconvolution operation).
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    Shape:
+        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
+
+        .. math::
+              H_{out} = (H_{in} - 1) \\times \\text{stride}[0] - 2 \\times \\text{padding}[0] + \\text{dilation}[0] 
+
+                        \\times (\\text{kernel_size}[0] - 1) + \\text{output_padding}[0] + 1
+        .. math::
+              W_{out} = (W_{in} - 1) \\times \\text{stride}[1] - 2 \\times \\text{padding}[1] + \\text{dilation}[1]
+              
+                        \\times (\\text{kernel_size}[1] - 1) + \\text{output_padding}[1] + 1
+
+    Attributes:
+        ConvTranspose2d.weight (Tensor): the learnable weights of the module of shape
+                         :math:`(\\text{in_channels}, \\frac{\\text{out_channels}}{\\text{groups}},`
+                         :math:`\\text{kernel_size[0]}, \\text{kernel_size[1]})`.
+                         The values of these weights are sampled from
+                         :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+                         :math:`k = \\frac{groups}{C_\\text{out} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+        ConvTranspose2d.bias (Tensor): the learnable bias of the module of shape (out_channels)
+                         If :attr:`bias` is ``True``, then the values of these weights are
+                         sampled from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+                         :math:`k = \\frac{groups}{C_\\text{out} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+
+    Examples::
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import oneflow.compatible.single_client.experimental.nn as nn
+        >>> flow.enable_eager_execution()
+
+        >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> m = m.to("cuda")
+        >>> input = flow.Tensor(np.random.randn(20, 16, 50, 100), device=flow.device("cuda"))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([20, 33, 93, 100])
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        output_padding: _size_2_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: int = 1,
+        padding_mode: str = "zeros",
+    ) -> None:
+        super().__init__()
+        assert padding_mode == "zeros"
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        output_padding = _pair(output_padding)
+        dilation = _pair(dilation)
+        self.groups = groups
+        assert in_channels % groups == 0
+        assert out_channels % groups == 0
+        self.weight = flow.nn.Parameter(
+            flow.Tensor(in_channels, out_channels // groups, *kernel_size)
+        )
+        self.in_channel_groups = in_channels // groups
+        self.bias = None
+        self._bias_add_op = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_channels))
+            self._bias_add_op = (
+                flow.builtin_op("bias_add")
+                .Input("a")
+                .Input("b")
+                .Output("out")
+                .Attr("axis", 1)
+                .Build()
+            )
+        self._op = (
+            flow.builtin_op("deconv2d")
+            .Input("in")
+            .Input("weight")
+            .Attr("filters", out_channels // groups)
+            .Attr("padding_before", padding)
+            .Attr("data_format", "channels_first")
+            .Attr("kernel_size", kernel_size)
+            .Attr("strides", stride)
+            .Attr("dilation_rate", dilation)
+            .Attr("output_padding", output_padding)
+            .Attr("groups", 1)
+            .Output("out")
+            .Build()
+        )
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        if self.groups > 1:
+            in_channel_axis = 1
+            in_split_list = ConvUtil.split(
+                x, axis=in_channel_axis, split_num=self.groups
+            )
+            out_list = []
+            for i in range(len(in_split_list)):
+                out_list.append(
+                    self._op(
+                        in_split_list[i],
+                        self.weight[
+                            i
+                            * self.in_channel_groups : (i + 1)
+                            * self.in_channel_groups,
+                            :,
+                            :,
+                            :,
+                        ],
+                    )[0]
+                )
+            res = flow.experimental.cat(out_list, dim=in_channel_axis)
+        else:
+            res = self._op(x, self.weight)[0]
+        if self._bias_add_op is not None:
+            res = self._bias_add_op(res, self.bias)[0]
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/dropout.py b/python/oneflow/compatible/single_client/nn/modules/dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5fc6cb4dcce9c6a430484d6b22238fdbf4c1e8c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/dropout.py
@@ -0,0 +1,107 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class _DropoutNd(Module):
+    __constants__ = ["p", "inplace"]
+    p: float
+    inplace: bool
+
+    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
+        super(_DropoutNd, self).__init__()
+        if p < 0 or p > 1:
+            raise ValueError(
+                "dropout probability has to be between 0 and 1, but got {}".format(p)
+            )
+        self.p = p
+        self.inplace = inplace
+
+    def extra_repr(self) -> str:
+        return "p={}, inplace={}".format(self.p, self.inplace)
+
+
+class Dropout(_DropoutNd):
+    """During training, randomly zeroes some of the elements of the input
+    tensor with probability :attr:`p` using samples from a Bernoulli
+    distribution. Each channel will be zeroed out independently on every forward
+    call.
+
+    This has proven to be an effective technique for regularization and
+    preventing the co-adaptation of neurons as described in the paper
+    "Improving neural networks by preventing co-adaptation of feature
+    detectors".
+
+    Furthermore, the outputs are scaled by a factor of :math:`\\frac{1}{1-p}` during
+    training. This means that during evaluation the module simply computes an
+    identity function.
+
+    Args:
+        p: probability of an element to be zeroed. Default: 0.5
+        inplace: If set to ``True``, will do this operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`. Input can be of any shape
+        - Output: :math:`(*)`. Output is of the same shape as input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> m = flow.nn.Dropout(p=0)
+        >>> arr = np.array(
+        ...    [
+        ...        [-0.7797, 0.2264, 0.2458, 0.4163],
+        ...        [0.4299, 0.3626, -0.4892, 0.4141],
+        ...        [-1.4115, 1.2183, -0.5503, 0.6520],
+        ...    ]
+        ... )
+        >>> x = flow.Tensor(arr)
+        >>> y = m(x)
+        >>> y #doctest: +ELLIPSIS
+        tensor([[-0.7797,  0.2264,  0.2458,  0.4163],
+                ...
+                [-1.4115,  1.2183, -0.5503,  0.652 ]], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(self, p: float = 0.5, inplace: bool = False, generator=None):
+        _DropoutNd.__init__(self, p, inplace)
+        self.p = p
+        if generator is None:
+            generator = flow.Generator()
+        self.generator = generator
+
+    def forward(self, x):
+        if self.p == 0.0 or not self.training:
+            return x
+        return flow.F.dropout(x, self.p, self.generator)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/eq.py b/python/oneflow/compatible/single_client/nn/modules/eq.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7ef178be7690861d30c7a97e420e07262e6b6ba
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/eq.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Eq(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, other):
+        if isinstance(other, flow.Tensor) or isinstance(
+            other, oneflow._oneflow_internal.Tensor
+        ):
+            for i in range(len(input.size())):
+                assert (
+                    input.shape[i] >= other.shape[i]
+                ), "The second tensor's shape should broadcastable with the first argument."
+                if input.dtype != other.dtype:
+                    other = other.to(dtype=input.dtype)
+        elif isinstance(other, int) or isinstance(other, float):
+            other = flow.Tensor([other], dtype=input.dtype, device=input.device)
+        else:
+            raise NotImplementedError(
+                "Unsupport data type, The second argument can be a tensor whose shape is broadcastable with the first argument."
+            )
+        return flow.F.broadcast_equal(input, other)
+
+
+@register_tensor_op("eq")
+def eq_op(input, other):
+    """
+    Computes element-wise equality.
+    The second argument can be a number or a tensor whose shape is broadcastable with the first argument.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): the tensor to compare
+        other (oneflow.compatible.single_client.Tensor, float or int): the target to compare
+
+    Returns:
+
+        - A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.array([2, 3, 4, 5]), dtype=flow.float32)
+        >>> other = flow.Tensor(np.array([2, 3, 4, 1]), dtype=flow.float32)
+
+        >>> y = flow.eq(input, other)
+        >>> y
+        tensor([1, 1, 1, 0], dtype=oneflow.int8)
+
+    """
+    return Eq()(input, other)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/exp.py b/python/oneflow/compatible/single_client/nn/modules/exp.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bd8e9b79e28d5fa1468229ac897ace890bfe8fb
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/exp.py
@@ -0,0 +1,65 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Exp(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.exp(x)
+
+
+@register_tensor_op("exp")
+def exp_op(x):
+    """This operator computes the exponential of Tensor.
+
+    The equation is:
+
+    .. math::
+
+        out = e^x
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([1, 2, 3]).astype(np.float32))
+        >>> y = x.exp()
+        >>> y
+        tensor([ 2.7183,  7.3891, 20.0855], dtype=oneflow.float32)
+
+    """
+    return Exp()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/expand.py b/python/oneflow/compatible/single_client/nn/modules/expand.py
new file mode 100644
index 0000000000000000000000000000000000000000..44323e125f030807f0b1f7d948c154f18a3ea2ed
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/expand.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Expand(Module):
+    def __init__(self, *sizes) -> None:
+        super().__init__()
+        self.expand_size = list(*sizes)
+
+    def forward(self, x):
+        if x.dtype == flow.int8:
+            x = flow.experimental.cast(x, flow.int32)
+        expand_size = self.expand_size
+        assert len(expand_size) >= len(
+            x.shape
+        ), "The desired expanded dims should not be less than the input dims."
+        original_stride = [1]
+        for i in range(len(x.shape) - 2, -1, -1):
+            original_stride.insert(0, original_stride[0] * x.shape[i + 1])
+        new_size = []
+        new_stride = []
+        diff = len(expand_size) - len(x.shape)
+        for i in range(len(expand_size) - 1, -1, -1):
+            if i >= diff:
+                if expand_size[i] == -1 or expand_size[i] == x.shape[i - diff]:
+                    new_size.insert(0, x.shape[i - diff])
+                    new_stride.insert(0, original_stride[i - diff])
+                else:
+                    assert expand_size[i] >= 1 and x.shape[i - diff] == 1
+                    new_size.insert(0, expand_size[i])
+                    new_stride.insert(0, 0)
+            else:
+                assert expand_size[i] >= 1
+                new_size.insert(0, expand_size[i])
+                if expand_size[i] == 1:
+                    new_stride.insert(0, new_stride[0])
+                else:
+                    new_stride.insert(0, 0)
+        return flow.F.expand(
+            x, in_shape=list(x.shape), out_shape=new_size, stride=new_stride
+        )
+
+
+@register_tensor_op("expand")
+def expand_op(x, *sizes):
+    """This operator expand the input tensor to a larger size.
+
+    Passing -1 as the size for a dimension means not changing the size of that dimension.
+
+    Tensor can be also expanded to a larger number of dimensions and the new ones will be appended at the front.
+
+    For the new dimensions, the size cannot be set to -1.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): The input Tensor.
+        *sizes  (flow.Size or int): The desired expanded size.
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([[[[0, 1]],
+        ...               [[2, 3]],
+        ...               [[4, 5]]]]).astype(np.int32)
+
+        >>> input = flow.Tensor(x)
+
+        >>> out = input.expand(1, 3, 2, 2)
+        >>> out.shape
+        flow.Size([1, 3, 2, 2])
+
+    """
+    return Expand(sizes)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/flatten.py b/python/oneflow/compatible/single_client/nn/modules/flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eac3c0b7ce99f7a1d6931d9e1edbb616c478ba5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/flatten.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Flatten(Module):
+    """Flattens a contiguous range of dims into a tensor. For use with: nn.Sequential.
+
+    Args:
+        start_dim: first dim to flatten (default = 1).
+        end_dim: last dim to flatten (default = -1).
+    
+
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client.experimental as flow
+        
+        input = flow.Tensor(32, 1, 5, 5)
+        m = flow.nn.Flatten()
+        output = m(input)
+        output.size()
+        # out flow.Size([32, 25])
+
+    """
+
+    def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
+        super().__init__()
+        self.start_dim = start_dim
+        self.end_dim = end_dim
+
+    def forward(self, input):
+        return flow.F.flatten(input, start_dim=self.start_dim, end_dim=self.end_dim)
+
+
+@register_tensor_op("flatten")
+def _flow_flatten(input, start_dim: int = 0, end_dim: int = -1):
+    """Flattens a contiguous range of dims into a tensor.
+
+    Args:
+        start_dim: first dim to flatten (default = 0).
+        end_dim: last dim to flatten (default = -1).
+    
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(32, 1, 5, 5)
+        >>> output = input.flatten(start_dim=1)
+        >>> output.size()
+        flow.Size([32, 25])
+
+    """
+    return Flatten(start_dim=start_dim, end_dim=end_dim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/floor.py b/python/oneflow/compatible/single_client/nn/modules/floor.py
new file mode 100644
index 0000000000000000000000000000000000000000..63fd9528945deddff6f0796a2f1fac70336faffb
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/floor.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import _check_axis
+
+
+class Floor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.floor(x)
+
+
+def floor_op(x):
+    """
+    Returns a new tensor with the arcsine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\lfloor \\text{input}_{i} \\rfloor
+
+    Args:
+        input (Tensor): the input tensor.
+        
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor(np.array([-0.5,  1.5, 0,  0.8]), dtype=flow.float32)
+        >>> output = flow.floor(input)
+        >>> output.shape
+        flow.Size([4])
+        >>> output.numpy()
+        array([-1.,  1.,  0.,  0.], dtype=float32)
+        
+        >>> input1 = flow.Tensor(np.array([[0.8, 1.0], [-0.6, 2.5]]), dtype=flow.float32)
+        >>> output1 = input1.floor()
+        >>> output1.shape
+        flow.Size([2, 2])
+        >>> output1.numpy()
+        array([[ 0.,  1.],
+               [-1.,  2.]], dtype=float32)
+
+    """
+    return Floor()(x)
+
+
+@register_tensor_op("floor")
+def floor_op_tensor(input):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.floor`
+    """
+    return Floor()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/gather.py b/python/oneflow/compatible/single_client/nn/modules/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..47c6e9200389c62e23054ea934d42dab3e2937cb
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/gather.py
@@ -0,0 +1,87 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import List, Optional, Tuple
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor, register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Gather(Module):
+    def __init__(self, dim: int = 0, sparse_grad: bool = False):
+        super().__init__()
+        assert sparse_grad is False, "Only support bool = False for now!"
+        self.dim = dim
+
+    def forward(self, input, index):
+        assert self.dim < len(
+            index.shape
+        ), "Value of dim is out of range(dim should be less than len(index.shape))"
+        assert len(input.shape) == len(
+            index.shape
+        ), "Dimensions of input and index should equal"
+        for i in range(0, len(input.shape)):
+            if self.dim == i:
+                continue
+            else:
+                assert (
+                    input.shape[i] == index.shape[i]
+                ), "Dimensions of input and index should be same except at dim"
+        return flow.F.dim_gather(input, index, dim=self.dim)
+
+
+@register_tensor_op("gather")
+def gather_op(input, index, dim=0, sparse_grad=False):
+    """Gathers values along an axis specified by `dim`.
+
+    For a 3-D tensor the output is specified by:
+
+        out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+        out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+        out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+
+    :attr:`input` and :attr:`index` must have the same number of dimensions.
+    It is also required that ``index.size(d) <= input.size(d)`` for all
+    dimensions ``d != dim``.  :attr:`out` will have the same shape as :attr:`index`.
+    Note that ``input`` and ``index`` do not broadcast against each other.
+
+    Args:
+        input (Tensor): the source tensor
+        dim (int): the axis along which to index
+        index (LongTensor): the indices of elements to gather
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = np.random.randn(3, 4, 3, 5)
+        >>> index = np.random.choice(np.arange(3), size=180, replace=True).reshape((3, 4, 3, 5))
+        >>> output = flow.gather(flow.Tensor(input), flow.Tensor(index, dtype=flow.int), dim=1)
+        >>> output.shape
+        flow.Size([3, 4, 3, 5])
+
+    """
+    return Gather(dim, sparse_grad)(input, index)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/greater.py b/python/oneflow/compatible/single_client/nn/modules/greater.py
new file mode 100644
index 0000000000000000000000000000000000000000..f49ea8f46a9a411af6633dd7dcb0e00ff297814d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/greater.py
@@ -0,0 +1,81 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Greater(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.experimental.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.experimental.cast(y, flow.float32)
+        return flow.F.broadcast_greater(x, y)
+
+
+def greater_op(x, y):
+    """Returns the truth value of :math:`x > y` element-wise.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+        y (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input1 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+
+        >>> out = flow.gt(input1, input2).shape
+        >>> out
+        flow.Size([2, 6, 5, 3])
+
+    """
+    return Greater()(x, y)
+
+
+@register_tensor_op("gt")
+def greater_op_tensor(x, y):
+    """
+
+    gt() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.gt`
+
+    """
+    return Greater()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/greater_equal.py b/python/oneflow/compatible/single_client/nn/modules/greater_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2bda3a0b7b4cfe7841b65de8f04ab7f637de491
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/greater_equal.py
@@ -0,0 +1,81 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class GreaterEqual(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.experimental.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.experimental.cast(y, flow.float32)
+        return flow.F.broadcast_greater_equal(x, y)
+
+
+def greater_equal_op(x, y):
+    """Returns the truth value of :math:`x >= y` element-wise.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+        y (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]).astype(np.float32), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([1, 1, 4]).astype(np.float32), dtype=flow.float32)
+
+        >>> out = flow.ge(input1, input2)
+        >>> out
+        tensor([1, 1, 0], dtype=oneflow.int8)
+
+    """
+    return GreaterEqual()(x, y)
+
+
+@register_tensor_op("ge")
+def greater_equal_op_tensor(x, y):
+    """
+
+    ge() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.ge`
+
+    """
+    return GreaterEqual()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/instancenorm.py b/python/oneflow/compatible/single_client/nn/modules/instancenorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b995628852cd9d0ffcf0575a32cf16595a3e402b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/instancenorm.py
@@ -0,0 +1,318 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.modules.batchnorm import _NormBase
+
+
+class _InstanceNorm(_NormBase):
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-05,
+        momentum: float = 0.1,
+        affine: bool = False,
+        track_running_stats: bool = False,
+    ):
+        super().__init__(num_features, eps, momentum, affine, track_running_stats)
+
+    def _forward(self, x):
+        axis = 1
+        params_shape = [x.shape[axis]]
+        weight = self.weight
+        bias = self.bias
+        nd_params_shape = [1] * len(x.shape)
+        nd_params_shape[axis] = params_shape[0]
+        mean = x.mean(2, keepdim=True)
+        variance = x.var(2, keepdim=True)
+        normalized = (x - mean) / flow.experimental.sqrt(variance + self.eps)
+        if self.weight and params_shape[0] == self.weight.nelement():
+            weight = self.weight.reshape(shape=nd_params_shape)
+        if self.bias and params_shape[0] == self.bias.nelement():
+            bias = self.bias.reshape(shape=nd_params_shape)
+        if self.weight:
+            normalized = normalized * weight
+        if self.bias:
+            normalized = normalized + bias
+        return normalized
+
+    def forward(self, x):
+        self._check_input_dim(x)
+        reshape_to_1d = x.reshape([x.shape[0], x.shape[1], -1])
+        normalized_1d_out = self._forward(reshape_to_1d)
+        reshape_back_to_nd = normalized_1d_out.reshape(list(x.shape))
+        return reshape_back_to_nd
+
+
+class InstanceNorm1d(_InstanceNorm):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.InstanceNorm1d.html
+
+    Applies Instance Normalization over a 3D input (a mini-batch of 1D
+    inputs with optional additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size) if :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm1d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm1d` is applied
+        on each channel of channeled data like multidimensional time series, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm1d` usually don't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, L)`
+        - Output: :math:`(N, C, L)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+
+        >>> # Without Learnable Parameters
+        >>> m = flow.nn.InstanceNorm1d(100)
+        >>> # With Learnable Parameters
+        >>> m = flow.nn.InstanceNorm1d(100, affine=True)
+        >>> x = flow.Tensor(np.random.randn(20, 100, 40))
+        >>> output = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.dim() == 2:
+            raise ValueError(
+                "InstanceNorm1d returns 0-filled tensor to 2D tensor.This is because InstanceNorm1d reshapes inputs to(1, N * C, ...) from (N, C,...) and this makesvariances 0."
+            )
+        if input.dim() != 3:
+            raise ValueError("expected 3D input (got {}D input)".format(input.dim()))
+
+
+class InstanceNorm2d(_InstanceNorm):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.InstanceNorm2d.html
+
+    Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs
+    with additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size) if :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm2d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm2d` is applied
+        on each channel of channeled data like RGB images, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm2d` usually don't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, H, W)`
+        - Output: :math:`(N, C, H, W)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+
+        >>> # Without Learnable Parameters
+        >>> m = flow.nn.InstanceNorm2d(100)
+        >>> # With Learnable Parameters
+        >>> m = flow.nn.InstanceNorm2d(100, affine=True)
+        >>> x = flow.Tensor(np.random.randn(20, 100, 35, 45))
+        >>> output = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.dim() != 4:
+            raise ValueError("expected 4D input (got {}D input)".format(input.dim()))
+
+
+class InstanceNorm3d(_InstanceNorm):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.InstanceNorm3d.html
+
+    Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs
+    with additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size C (where C is the input size) if :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm3d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm3d` is applied
+        on each channel of channeled data like 3D models with RGB color, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm3d` usually don't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, D, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)`
+        - Output: :math:`(N, C, D, H, W)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+
+        >>> # Without Learnable Parameters
+        >>> m = flow.nn.InstanceNorm3d(100)
+        >>> # With Learnable Parameters
+        >>> m = flow.nn.InstanceNorm3d(100, affine=True)
+        >>> x = flow.Tensor(np.random.randn(20, 100, 35, 45, 10))
+        >>> output = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.dim() != 5:
+            raise ValueError("expected 5D input (got {}D input)".format(input.dim()))
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/less.py b/python/oneflow/compatible/single_client/nn/modules/less.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a10893f2d4603cf1ea1d5537af10a52473e1a84
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/less.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Less(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.experimental.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.experimental.cast(y, flow.float32)
+        return flow.F.broadcast_less(x, y)
+
+
+@register_tensor_op("lt")
+def less_op(x, y):
+    """Returns the truth value of :math:`x < y` element-wise.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+        y (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]).astype(np.float32), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([1, 2, 4]).astype(np.float32), dtype=flow.float32)
+
+        >>> out = flow.lt(input1, input2)
+        >>> out
+        tensor([0, 0, 1], dtype=oneflow.int8)
+
+    """
+    return Less()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/less_equal.py b/python/oneflow/compatible/single_client/nn/modules/less_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3d8145b8f9985450f2c24d31ed93441be18bc23
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/less_equal.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class LessEqual(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.experimental.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.experimental.cast(y, flow.float32)
+        return flow.F.broadcast_less_equal(x, y)
+
+
+@register_tensor_op("le")
+def less_equal_op(x, y):
+    """Returns the truth value of :math:`x <= y` element-wise.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+        y (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]).astype(np.float32), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([1, 1, 4]).astype(np.float32), dtype=flow.float32)
+
+        >>> out = flow.le(input1, input2)
+        >>> out
+        tensor([1, 0, 1], dtype=oneflow.int8)
+
+    """
+    return LessEqual()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/linear.py b/python/oneflow/compatible/single_client/nn/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..af5f8d98a168db08471aa7558bb5464d287cfff5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/linear.py
@@ -0,0 +1,128 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+from typing import List, Optional, Tuple
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn.init import _calculate_fan_in_and_fan_out
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Identity(Module):
+    """A placeholder identity operator that is argument-insensitive.
+
+    Args:
+        args: any argument (unused)
+        kwargs: any keyword argument (unused)
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+
+        m = flow.nn.Identity()
+        input = flow.Tensor(np.random.rand(2, 3, 4, 5))
+
+        output = m(input)
+
+        # output = input
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, input: Tensor) -> Tensor:
+        return input
+
+
+class Linear(Module):
+    """Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
+
+    Args:
+
+        - in_features: size of each input sample
+
+        - out_features: size of each output sample
+
+        - bias: If set to ``False``, the layer will not learn an additive bias. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
+          additional dimensions and :math:`H_{in} = {in\\_features}`
+
+        - Output: :math:`(N, *, H_{out})` where all but the last dimension
+          are the same shape as the input and :math:`H_{out} = {out\\_features}`.
+
+    Attr:
+        - :attr:`weight`: the learnable weights of the module of shape :math:`({out\\_features}, {in\\_features})`. The values are initialized from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})`, where :math:`(k = 1 / {in\\_features})`
+
+        - :attr:`bias`: the learnable bias of the module of shape :math:`({out\\_features})`. If :attr:`bias` is ``True``, the values are initialized from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where :math:`(k = 1 / {in\\_features})`
+
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+
+        >>> m = flow.nn.Linear(20, 30, False)
+        >>> input = flow.Tensor(np.random.randn(128, 20))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([128, 30])
+
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
+        super().__init__()
+        self.use_bias = bias
+        self.weight = flow.nn.Parameter(flow.Tensor(out_features, in_features))
+        self.bias = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_features))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        flow.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = _calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            flow.nn.init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        assert len(x.shape) >= 2, "Tensor x's dim should >=2"
+        if len(x.shape) == 2:
+            res = flow.F.matmul(x, self.weight, transpose_a=False, transpose_b=True)
+        else:
+            res = flow.F.broadcast_matmul(
+                x, self.weight, transpose_a=False, transpose_b=True
+            )
+        if self.use_bias:
+            res += self.bias
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/log1p.py b/python/oneflow/compatible/single_client/nn/modules/log1p.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc12c1b326515255d1dfc47cd94b84d5d5a313cd
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/log1p.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Log1p(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op = flow.builtin_op("log1p").Input("x").Output("y").Build()
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+@register_tensor_op("log1p")
+def log1p_op(input):
+    """Returns a new tensor with the natural logarithm of (1 + input).
+
+    .. math::
+        \\text{out}_{i}=\\log_e(1+\\text{input}_{i})
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> x = flow.Tensor(np.array([1.3, 1.5, 2.7]))
+        >>> out = flow.log1p(x).numpy()
+        >>> out
+        array([0.8329091 , 0.91629076, 1.3083328 ], dtype=float32)
+
+    """
+    return Log1p()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/loss.py b/python/oneflow/compatible/single_client/nn/modules/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..25af04ed9d10f50b9425bb0789da2f70b53f626c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/loss.py
@@ -0,0 +1,1073 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.constant import _ConstantBase
+
+
+class L1Loss(Module):
+    """This operator computes the L1 Loss between each element in `input` and `target`.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        output = |Target - Input|
+
+    if reduction = "mean":
+
+    .. math::
+
+        output = \\frac{1}{n}\\sum_{i=1}^n|Target_i - Input_i|
+
+    if reduction = "sum":
+
+    .. math::
+
+        output = \\sum_{i=1}^n|Target_i - Input_i|
+
+    Args:
+        input (oneflow.compatible.single_client.experimental.Tensor): The input Tensor.
+        target (oneflow.compatible.single_client.experimental.Tensor): The target Tensor.
+        reduction (str): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+
+    Returns:
+        oneflow.compatible.single_client.experimental.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor([[1, 1, 1], [2, 2, 2], [7, 7, 7]], dtype = flow.float32)
+        >>> target = flow.Tensor([[4, 4, 4], [4, 4, 4], [4, 4, 4]], dtype = flow.float32)
+        >>> m = flow.nn.L1Loss(reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([[3., 3., 3.],
+                [2., 2., 2.],
+                [3., 3., 3.]], dtype=oneflow.float32)
+        >>> m_mean = flow.nn.L1Loss(reduction="mean")
+        >>> out = m_mean(input, target)
+        >>> out
+        tensor([2.6667], dtype=oneflow.float32)
+        >>> m_mean = flow.nn.L1Loss(reduction="sum")
+        >>> out = m_mean(input, target)
+        >>> out
+        tensor([24.], dtype=oneflow.float32)
+    """
+
+    def __init__(self, reduction: str = "mean", reduce=True) -> None:
+        super().__init__()
+        if reduce is not None and (not reduce):
+            raise ValueError("Argument reduce is not supported yet")
+        assert reduction in [
+            "none",
+            "mean",
+            "sum",
+            None,
+        ], "only 'sum', 'mean' and 'none' supported by now"
+        self.reduction = reduction
+
+    def forward(self, input, target):
+        assert (
+            input.shape == target.shape
+        ), "The Input shape must be the same as Target shape"
+        l1_value = flow.experimental.abs(flow.experimental.sub(input, target))
+        if self.reduction == "mean":
+            return flow.experimental.mean(l1_value)
+        elif self.reduction == "sum":
+            return flow.experimental.sum(l1_value)
+        else:
+            return l1_value
+
+
+class CrossEntropyLoss(Module):
+    """This criterion combines :class:`~flow.nn.LogSoftmax` and :class:`~flow.nn.NLLLoss` in one single class.
+
+    It is useful when training a classification problem with `C` classes.
+
+    The `input` is expected to contain raw, unnormalized scores for each class.
+
+    `input` has to be a Tensor of size either :math:`(minibatch, C)` or
+    :math:`(minibatch, C, d_1, d_2, ..., d_K)`
+    with :math:`K \\geq 1` for the `K`-dimensional case (described later).
+
+    This criterion expects a class index in the range :math:`[0, C-1]` as the
+    `target` for each value of a 1D tensor of size `minibatch`;
+
+    The loss can be described as:
+
+    .. math::
+        \\text{loss}(x, class) = -\\log\\left(\\frac{\\exp(x[class])}{\\sum_j \\exp(x[j])}\\right)
+                       = -x[class] + \\log\\left(\\sum_j \\exp(x[j])\\right)
+
+    Can also be used for higher dimension inputs, such as 2D images, by providing
+    an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \\geq 1`,
+    where :math:`K` is the number of dimensions, and a target of appropriate shape
+    (see below).
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
+            be applied, ``'mean'``: the weighted mean of the output is taken,
+            ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(
+        ...    [[-0.1664078, -1.7256707, -0.14690138],
+        ...        [-0.21474946, 0.53737473, 0.99684894],
+        ...        [-1.135804, -0.50371903, 0.7645404]], dtype=flow.float32)
+        >>> target = flow.Tensor(np.array([0, 1, 2]), dtype=flow.int32)
+        >>> out = flow.nn.CrossEntropyLoss(reduction="none")(input, target)
+        >>> out
+        tensor([0.802 , 1.1167, 0.3583], dtype=oneflow.float32)
+        >>> out_sum = flow.nn.CrossEntropyLoss(reduction="sum")(input, target)
+        >>> out_sum
+        tensor([2.2769], dtype=oneflow.float32)
+        >>> out_mean = flow.nn.CrossEntropyLoss(reduction="mean")(input, target)
+        >>> out_mean
+        tensor([0.759], dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        weight=None,
+        ignore_index: Optional[int] = None,
+        reduction: Optional[str] = "mean",
+    ) -> None:
+        super().__init__()
+        if weight is not None:
+            raise ValueError("Argument weight is not supported yet")
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.ignore_index = ignore_index
+        self.reduction = reduction
+
+    def forward(self, input, target):
+        assert len(input.shape) <= 4
+        assert len(target.shape) == len(input.shape) - 1
+        input_shape_len = len(input.shape)
+        if input_shape_len == 3:
+            (b, c, h) = (input.shape[0], input.shape[1], input.shape[2])
+            input = flow.F.transpose(input, perm=(0, 2, 1))
+            input = input.reshape(shape=[-1, input.shape[2]])
+            target = target.flatten()
+        elif input_shape_len == 4:
+            (b, c, h, w) = (
+                input.shape[0],
+                input.shape[1],
+                input.shape[2],
+                input.shape[3],
+            )
+            input = flow.F.transpose(input, perm=(0, 2, 3, 1))
+            input = input.reshape(shape=[-1, input.shape[3]])
+            target = target.flatten()
+        elif input_shape_len >= 5:
+            raise NotImplemented
+        out = flow.F.sparse_softmax_cross_entropy(
+            input, target, depth=input.shape[len(input.shape) - 1]
+        )
+        if self.ignore_index is not None:
+            zeros = flow.experimental.zeros(
+                size=out.shape, dtype=out.dtype, device=out.device
+            )
+            condition = flow.experimental.eq(target, self.ignore_index)
+            ones = flow.experimental.ones(
+                size=condition.shape, dtype=condition.dtype, device=condition.device
+            )
+            condition = ones.sub(condition).reshape(tuple(out.shape))
+            out = flow.experimental.where(condition, out, zeros)
+            if self.reduction == "mean":
+                reduce_sum = out.sum()
+                reduce_count = condition.argwhere().shape[0]
+                out = flow.experimental.mul(reduce_sum, 1.0 / reduce_count)
+        if self.reduction == "mean":
+            return out.mean()
+        elif self.reduction == "sum":
+            return out.sum()
+        else:
+            if input_shape_len == 4:
+                out = out.reshape((b, h, w))
+            return out
+
+
+class BCELoss(Module):
+    """This operator computes the binary cross entropy loss.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        out = -(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    if reduction = "mean":
+
+    .. math::
+
+        out = -\\frac{1}{n}\\sum_{i=1}^n(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    if reduction = "sum":
+
+    .. math::
+
+        out = -\\sum_{i=1}^n(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    Args:
+        weight (oneflow.compatible.single_client.experimental.Tensor, optional): The manual rescaling weight to the loss. Default to None, whose corresponding weight value is 1.
+        reduction (str, optional): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+
+    Attention:
+        The input value must be in the range of (0, 1). Or the loss function may return `nan` value.
+
+    Returns:
+        oneflow.compatible.single_client.experimental.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+    
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.array([[1.2, 0.2, -0.3], [0.7, 0.6, -2]]).astype(np.float32))
+        >>> target = flow.Tensor(np.array([[0, 1, 0], [1, 0, 1]]).astype(np.float32))
+        >>> weight = flow.Tensor(np.array([[2, 2, 2], [2, 2, 2]]).astype(np.float32))
+        >>> activation = flow.nn.Sigmoid()
+        >>> sigmoid_input = activation(input)
+        >>> m = flow.nn.BCELoss(weight, reduction="none")
+        >>> out = m(sigmoid_input, target)
+        >>> out
+        tensor([[2.9266, 1.1963, 1.1087],
+                [0.8064, 2.075 , 4.2539]], dtype=oneflow.float32)
+        >>> m_sum = flow.nn.BCELoss(weight, reduction="sum")
+        >>> out = m_sum(sigmoid_input, target)
+        >>> out
+        tensor([12.3668], dtype=oneflow.float32)
+        >>> m_mean = flow.nn.BCELoss(weight, reduction="mean")
+        >>> out = m_mean(sigmoid_input, target)
+        >>> out
+        tensor([2.0611], dtype=oneflow.float32)
+        >>> m_none = flow.nn.BCELoss()
+        >>> out = m_none(sigmoid_input, target)
+        >>> out
+        tensor([1.0306], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, weight: Tensor = None, reduction: str = "mean") -> None:
+        super().__init__()
+        assert reduction in [
+            "none",
+            "sum",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and 'none' supported by now"
+        self.weight = weight
+        self.reduction = reduction
+
+    def forward(self, input, target):
+        assert (
+            input.shape == target.shape
+        ), "The Input shape must be the same as Target shape"
+        _cross_entropy_loss = flow.experimental.negative(
+            target * flow.experimental.log(input)
+            + (1 - target) * flow.experimental.log(1 - input)
+        )
+        if self.weight is not None:
+            assert (
+                self.weight.shape == input.shape
+            ), "The weight shape must be the same as Input shape"
+            _weighted_loss = self.weight * _cross_entropy_loss
+        else:
+            _weighted_loss = _cross_entropy_loss
+        if self.reduction == "mean":
+            return flow.experimental.mean(_weighted_loss)
+        elif self.reduction == "sum":
+            return flow.experimental.sum(_weighted_loss)
+        else:
+            return _weighted_loss
+
+
+class NLLLoss(Module):
+    """ The negative log likelihood loss. It is useful to train a classification
+    problem with `C` classes.
+
+    The `input` given through a forward call is expected to contain
+    log-probabilities of each class. `input` has to be a Tensor of size either
+    :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
+    with :math:`K \\geq 1` for the `K`-dimensional case (described later).
+
+    Obtaining log-probabilities in a neural network is easily achieved by
+    adding a  `LogSoftmax`  layer in the last layer of your network.
+    You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
+    layer.
+
+    The `target` that this loss expects should be a class index in the range :math:`[0, C-1]`
+    where `C = number of classes`;
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad
+        l_n = - w_{y_n} x_{n,y_n}, \\quad
+        w_{c} = \\mathbb{1},
+
+    where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and
+    :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then
+
+    .. math::
+        \\ell(x, y) = \\begin{cases}
+            \\sum_{n=1}^N \\frac{1}{N} l_n, &
+            \\text{if reduction} = \\text{`mean';}\\\\
+            \\sum_{n=1}^N l_n,  &
+            \\text{if reduction} = \\text{`sum'.}
+        \\end{cases}
+
+    Can also be used for higher dimension inputs, such as 2D images, by providing
+    an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \\geq 1`,
+    where :math:`K` is the number of dimensions, and a target of appropriate shape
+    (see below). In the case of images, it computes NLL loss per-pixel.
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
+            be applied, ``'mean'``: the weighted mean of the output is taken,
+            ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> import numpy as np
+
+        >>> input = flow.Tensor(
+        ... [[-0.1664078, -1.7256707, -0.14690138],
+        ... [-0.21474946, 0.53737473, 0.99684894],
+        ... [-1.135804, -0.50371903, 0.7645404]], dtype=flow.float32)
+        >>> target = flow.Tensor(np.array([0, 1, 2]), dtype=flow.int32)
+        >>> m = flow.nn.NLLLoss(reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([ 0.1664, -0.5374, -0.7645], dtype=oneflow.float32)
+
+        >>> m = flow.nn.NLLLoss(reduction="sum")
+        >>> out = m(input, target)
+        >>> out
+        tensor([-1.1355], dtype=oneflow.float32)
+
+        >>> m = flow.nn.NLLLoss(reduction="mean")
+        >>> out = m(input, target)
+        >>> out
+        tensor([-0.3785], dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self, weight=None, ignore_index: int = None, reduction: str = "mean"
+    ) -> None:
+        super().__init__()
+        if weight != None:
+            raise ValueError("Argument weight is not supported yet")
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.ignore_index = ignore_index
+        self.reduction = reduction
+
+    def nllloss_1d(self, input, target):
+        target = flow.F.reshape(target, shape=(target.shape[0], 1))
+        res = flow.F.dim_gather(input, target, dim=1)
+        res = flow.F.squeeze(res, dim=[1])
+        return res
+
+    def forward(self, input, target):
+        assert len(input.shape) <= 4
+        assert len(target.shape) == len(input.shape) - 1
+        input = input.negative()
+        if len(input.shape) == 2:
+            res = self.nllloss_1d(input, target)
+        elif len(input.shape) == 3:
+            (b, c, h) = (input.shape[0], input.shape[1], input.shape[2])
+            input = flow.F.transpose(input, perm=(0, 2, 1))
+            input = input.reshape(shape=[-1, input.shape[2]])
+            target = target.flatten()
+            res = self.nllloss_1d(input, target)
+            res = res.reshape((b, h))
+        elif len(input.shape) == 4:
+            (b, c, h, w) = (
+                input.shape[0],
+                input.shape[1],
+                input.shape[2],
+                input.shape[3],
+            )
+            input = flow.F.transpose(input, perm=(0, 2, 3, 1))
+            input = input.reshape(shape=[-1, input.shape[3]])
+            target = target.flatten()
+            res = self.nllloss_1d(input, target)
+            res = res.reshape((b, h, w))
+        else:
+            raise NotImplemented
+        if self.ignore_index is not None:
+            zeros = flow.experimental.zeros(
+                size=res.shape, dtype=res.dtype, device=res.device
+            )
+            condition = flow.experimental.eq(target, self.ignore_index)
+            ones = flow.experimental.ones(
+                size=condition.shape, dtype=condition.dtype, device=condition.device
+            )
+            condition = ones.sub(condition).reshape(tuple(res.shape))
+            res = flow.experimental.where(condition, res, zeros)
+            if self.reduction == "mean":
+                res = res.sum()
+                reduce_count = condition.argwhere().shape[0]
+                res = flow.experimental.mul(res, 1.0 / reduce_count)
+        if self.reduction == "none":
+            return res
+        elif self.reduction == "sum":
+            return res.sum()
+        else:
+            return res.mean()
+
+
+class KLDivLoss(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html?highlight=kldivloss#torch.nn.KLDivLoss
+
+    The Kullback-Leibler divergence loss measure
+
+    `Kullback-Leibler divergence`_ is a useful distance measure for continuous
+    distributions and is often useful when performing direct regression over
+    the space of (discretely sampled) continuous output distributions.
+
+    As with :class:`~torch.nn.NLLLoss`, the `input` given is expected to contain
+    *log-probabilities* and is not restricted to a 2D Tensor.
+    The targets are interpreted as *probabilities* by default, but could be considered
+    as *log-probabilities* with :attr:`log_target` set to ``True``.
+
+    This criterion expects a `target` `Tensor` of the same size as the
+    `input` `Tensor`.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        l(x,y) = L = \\{ l_1,\\dots,l_N \\}, \\quad
+        l_n = y_n \\cdot \\left( \\log y_n - x_n \\right)
+
+    where the index :math:`N` spans all dimensions of ``input`` and :math:`L` has the same
+    shape as ``input``. If :attr:`reduction` is not ``'none'`` (default ``'mean'``), then:
+
+    .. math::
+        \\ell(x, y) = \\begin{cases}
+            \\operatorname{mean}(L), & \\text{if reduction} = \\text{`mean';} \\\\
+            \\operatorname{sum}(L),  & \\text{if reduction} = \\text{`sum'.}
+        \\end{cases}
+
+    In default :attr:`reduction` mode ``'mean'``, the losses are averaged for each minibatch over observations
+    **as well as** over dimensions. ``'batchmean'`` mode gives the correct KL divergence where losses
+    are averaged over batch dimension only. ``'mean'`` mode's behavior will be changed to the same as
+    ``'batchmean'`` in the next major release.
+
+    .. _`kullback-leibler divergence`: https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'batchmean'`` | ``'sum'`` | ``'mean'``.
+            ``'none'``: no reduction will be applied.
+            ``'batchmean'``: the sum of the output will be divided by batchsize.
+            ``'sum'``: the output will be summed.
+            ``'mean'``: the output will be divided by the number of elements in the output.
+            Default: ``'mean'``
+        log_target (bool, optional): Specifies whether `target` is passed in the log space.
+            Default: ``False``
+
+    .. note::
+        :attr:`reduction` = ``'mean'`` doesn't return the true kl divergence value, please use
+        :attr:`reduction` = ``'batchmean'`` which aligns with KL math definition.
+        In the next major release, ``'mean'`` will be changed to be the same as ``'batchmean'``.
+
+    Shape:
+        - Input: :math:`(N, *)` where :math:`*` means, any number of additional
+          dimensions
+        - Target: :math:`(N, *)`, same shape as the input
+        - Output: scalar by default. If :attr:``reduction`` is ``'none'``, then :math:`(N, *)`,
+          the same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor([-0.9021705, 0.08798598, 1.04686249], dtype=flow.float32)
+        >>> target = flow.Tensor([1.22386942, -0.89729659, 0.01615712], dtype=flow.float32)
+        >>> m = flow.nn.KLDivLoss(reduction="none", log_target=False)
+        >>> out = m(input, target)
+        >>> out
+        tensor([ 1.3514,  0.    , -0.0836], dtype=oneflow.float32)
+        >>> m = flow.nn.KLDivLoss(reduction="mean", log_target=False)
+        >>> out = m(input, target)
+        >>> out
+        tensor([0.4226], dtype=oneflow.float32)
+        >>> m = flow.nn.KLDivLoss(reduction="sum", log_target=True)
+        >>> out = m(input, target)
+        >>> out
+        tensor([5.7801], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, reduction: str = "mean", log_target: bool = False) -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "Argument reduction only support 'sum'/'mean'/'none'/None for now!"
+        self.reduction = reduction
+        self.log_target = log_target
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        if self.log_target:
+            _kl_div_loss = flow.experimental.exp(target) * (target - input)
+        else:
+            _kl_div_out_loss = target * (flow.experimental.log(target) - input)
+            _zeros = flow.experimental.zeros(
+                size=_kl_div_out_loss.shape,
+                dtype=_kl_div_out_loss.dtype,
+                device=_kl_div_out_loss.device,
+            )
+            _condition = flow.experimental.gt(target, 0)
+            _kl_div_loss = flow.experimental.where(_condition, _kl_div_out_loss, _zeros)
+        if self.reduction == "mean":
+            return flow.experimental.mean(_kl_div_loss)
+        elif self.reduction == "sum":
+            return flow.experimental.sum(_kl_div_loss)
+        else:
+            return _kl_div_loss
+
+
+class MSELoss(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html?highlight=mseloss#torch.nn.MSELoss
+
+    Creates a criterion that measures the mean squared error (squared L2 norm) between
+    each element in the input :math:`x` and target :math:`y`.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad
+        l_n = \\left( x_n - y_n \\right)^2,
+
+    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then:
+
+    .. math::
+        \\ell(x, y) =
+        \\begin{cases}
+            \\operatorname{mean}(L), &  \\text{if reduction} = \\text{`mean';}\\\\
+            \\operatorname{sum}(L),  &  \\text{if reduction} = \\text{`sum'.}
+        \\end{cases}
+
+    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
+    of :math:`n` elements each.
+
+    The mean operation still operates over all the elements, and divides by :math:`n`.
+
+    The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, *)` where :math:`*` means, any number of additional
+          dimensions
+        - Target: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(
+        ... [[-0.02557137, 0.03101675, 1.37493674],
+        ... [0.25599439, -1.08372561, -0.21006816]], dtype=flow.float32)
+        >>> target = flow.Tensor(
+        ... [[-1.53105064, -0.68137555, 0.5931354],
+        ... [-0.49158347, 0.93673637, 0.1324141]], dtype=flow.float32)
+        >>> m = flow.nn.MSELoss(reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([[2.2665, 0.5075, 0.6112],
+                [0.5589, 4.0823, 0.1173]], dtype=oneflow.float32)
+        >>> m = flow.nn.MSELoss(reduction="mean")
+        >>> out = m(input, target)
+        >>> out
+        tensor([1.3573], dtype=oneflow.float32)
+        >>> m = flow.nn.MSELoss(reduction="sum")
+        >>> out = m(input, target)
+        >>> out
+        tensor([8.1436], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, reduction: str = "mean") -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "Argument reduction only support 'sum'/'mean'/'none'/None for now!"
+        self.reduction = reduction
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        mean_squared_difference = flow.experimental.square(
+            flow.experimental.sub(input, target)
+        )
+        if self.reduction == "mean":
+            return flow.experimental.mean(mean_squared_difference)
+        elif self.reduction == "sum":
+            return flow.experimental.sum(mean_squared_difference)
+        else:
+            return mean_squared_difference
+
+
+class MarginRankingLoss(Module):
+    """Creates a criterion that measures the loss given
+    inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensors`,
+    and a label 1D mini-batch tensor :math:`y` (containing 1 or -1).
+
+    If :math:`y = 1` then it assumed the first input should be ranked higher
+    (have a larger value) than the second input, and vice-versa for :math:`y = -1`.
+
+    The loss function for each sample in the mini-batch is:
+
+    .. math::
+        \\text{loss}(x1, x2, y) = \\max(0, -y * (x1 - x2) + \\text{margin})
+
+    Args:
+        margin (float, optional): Has a default value of :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    Shape:
+        - `x1` : :math:`(N, D)` where `N` is the batch size and `D` is the size of a sample.
+        - `x2` : :math:`(N, D)` where `N` is the batch size and `D` is the size of a sample.
+        - Target: :math:`(N)`
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> import numpy as np
+
+        >>> x1 = flow.Tensor(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), dtype=flow.float32)
+        >>> x2 = flow.Tensor(np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), dtype=flow.float32)
+        >>> target = flow.Tensor(np.array([[1, -1, 1],[-1, 1, -1], [1, 1, 1]]), dtype=flow.float32)
+        >>> m = flow.nn.MarginRankingLoss(margin =1.0, reduction="none")
+        >>> out = m(x1, x2, target)
+        >>> out
+        tensor([[2., 1., 0.],
+                [3., 0., 5.],
+                [0., 0., 0.]], dtype=oneflow.float32)
+
+        >>> m = flow.nn.MarginRankingLoss(margin = 0.3, reduction="sum")
+        >>> out = m(x1, x2, target)
+        >>> out
+        tensor([8.2], dtype=oneflow.float32)
+
+        >>> m = flow.nn.MarginRankingLoss(margin = 10, reduction="mean")
+        >>> out = m(x1, x2, target)
+        >>> out
+        tensor([8.3333], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(self, margin=0.0, reduction: str = "mean") -> None:
+        super().__init__()
+        self.margin = margin
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.reduction = reduction
+
+    def forward(self, input1, input2, target):
+        res = flow.experimental.clip(
+            flow.experimental.add(
+                self.margin,
+                flow.experimental.mul(
+                    target,
+                    flow.experimental.mul(-1, flow.experimental.sub(input1, input2)),
+                ),
+            ),
+            min=0.0,
+        )
+        if self.reduction == "none":
+            return res
+        elif self.reduction == "sum":
+            return res.sum()
+        else:
+            return res.mean()
+
+
+class CTCLoss(Module):
+    """The Connectionist Temporal Classification loss.
+    The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html#torch.nn.CTCLoss
+
+    Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the
+    probability of possible alignments of input to target, producing a loss value which is differentiable
+    with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which
+    limits the length of the target sequence such that it must be :math:`\\leq` the input length.
+
+    Args:
+        blank (int, optional): blank label. Default :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the output losses will be divided by the target lengths and
+            then the mean over the batch is taken. Default: ``'mean'``
+        zero_infinity (bool, optional):
+            Whether to zero infinite losses and the associated gradients.
+            Default: ``False``
+            Infinite losses mainly occur when the inputs are too short
+            to be aligned to the targets.
+
+    Shape:
+        - Log_probs: Tensor of size :math:`(T, N, C)`,
+          where :math:`T = \\text{input length}`,
+          :math:`N = \\text{batch size}`, and
+          :math:`C = \\text{number of classes (including blank)}`.
+        - Targets: Tensor of size :math:`(N, S)` or
+          :math:`(\\operatorname{sum}(\\text{target\\_lengths}))`,
+          where :math:`N = \\text{batch size}` and
+          :math:`S = \\text{max target length, if shape is } (N, S)`.
+          It represent the target sequences. Each element in the target
+          sequence is a class index. And the target index cannot be blank (default=0).
+          In the :math:`(N, S)` form, targets are padded to the
+          length of the longest sequence, and stacked.
+          In the :math:`(\\operatorname{sum}(\\text{target\\_lengths}))` form,
+          the targets are assumed to be un-padded and
+          concatenated within 1 dimension.
+        - Input_lengths: Tuple or tensor of size :math:`(N)`,
+          where :math:`N = \\text{batch size}`. It represent the lengths of the
+          inputs (must each be :math:`\\leq T`). And the lengths are specified
+          for each sequence to achieve masking under the assumption that sequences
+          are padded to equal lengths.
+        - Target_lengths: Tuple or tensor of size :math:`(N)`,
+          where :math:`N = \\text{batch size}`. It represent lengths of the targets.
+          Lengths are specified for each sequence to achieve masking under the
+          assumption that sequences are padded to equal lengths. If target shape is
+          :math:`(N,S)`, target_lengths are effectively the stop index
+          :math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for
+          each target in a batch. Lengths must each be :math:`\\leq S`
+          If the targets are given as a 1d tensor that is the concatenation of individual
+          targets, the target_lengths must add up to the total length of the tensor.
+
+    Reference:
+        A. Graves et al.: Connectionist Temporal Classification:
+        Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
+        https://www.cs.toronto.edu/~graves/icml_2006.pdf
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> import numpy as np
+        >>> log_probs = np.array(
+        ...             [
+        ...                 [[-1.1031, -0.7998, -1.5200], [-0.9808, -1.1363, -1.1908]],
+        ...                 [[-1.2258, -1.0665, -1.0153], [-1.1135, -1.2331, -0.9671]],
+        ...                 [[-1.3348, -0.6611, -1.5118], [-0.9823, -1.2355, -1.0941]],
+        ...                 [[-1.3850, -1.3273, -0.7247], [-0.8235, -1.4783, -1.0994]],
+        ...                 [[-0.9049, -0.8867, -1.6962], [-1.4938, -1.3630, -0.6547]],
+        ...             ]
+        ...         ).astype(np.float32)
+        >>> log_probs = flow.Tensor(log_probs, dtype=flow.float32)
+        >>> targets = flow.Tensor(np.array([[1, 2, 2], [1, 2, 2]]).astype("int32"), dtype=flow.int32)
+        >>> input_lengths = flow.Tensor(np.array([5, 5]).astype("int32"), dtype=flow.int32)
+        >>> target_lengths = flow.Tensor(np.array([3, 3]).astype("int32"), dtype=flow.int32)
+        >>> loss_mean = flow.nn.CTCLoss()
+        >>> out = loss_mean(log_probs, targets, input_lengths, target_lengths)
+        >>> out
+        tensor([1.1376], dtype=oneflow.float32)
+        >>> loss_sum = flow.nn.CTCLoss(blank=0, reduction="sum")
+        >>> out = loss_sum(log_probs, targets, input_lengths, target_lengths)
+        >>> out
+        tensor([6.8257], dtype=oneflow.float32)
+        >>> 
+
+    """
+
+    def __init__(
+        self, blank: int = 0, reduction: str = "mean", zero_infinity: bool = False
+    ) -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.reduction = reduction
+        self.zero_infinity = zero_infinity
+        self._op = (
+            flow.builtin_op("ctc_loss")
+            .Input("log_probs")
+            .Input("targets")
+            .Input("input_lengths")
+            .Input("target_lengths")
+            .Output("loss")
+            .Output("alpha")
+            .Attr("blank", int(blank))
+            .Attr("zero_infinity", zero_infinity)
+            .Build()
+        )
+        self._xdivy_op = (
+            flow.builtin_op("xdivy").Input("x").Input("y").Output("z").Build()
+        )
+        self.constant = _ConstantBase
+
+    def forward(
+        self,
+        log_probs: Tensor,
+        targets: Tensor,
+        input_lengths: Tensor,
+        target_lengths: Tensor,
+    ) -> Tensor:
+        (loss, _) = self._op(log_probs, targets, input_lengths, target_lengths)
+        if self.zero_infinity:
+            cond = flow.experimental.eq(
+                loss,
+                self.constant(
+                    size=loss.shape,
+                    value=float("inf"),
+                    dtype=loss.dtype,
+                    device=loss.device,
+                )(),
+            )
+            loss = flow.experimental.where(
+                cond,
+                flow.experimental.zeros(
+                    size=loss.shape, dtype=loss.dtype, device=loss.device
+                ),
+                loss,
+            )
+        if self.reduction == "mean":
+            return flow.experimental.mean(
+                self._xdivy_op(
+                    loss,
+                    flow.experimental.cast(
+                        flow.experimental.clamp(target_lengths, min=1),
+                        dtype=log_probs.dtype,
+                    ),
+                )[0]
+            )
+        elif self.reduction == "sum":
+            return flow.experimental.sum(loss)
+        else:
+            return loss
+
+
+class BCEWithLogitsLoss(Module):
+    """This operator combines the `Sigmoid` and `BCELoss` together. For numerical stability,
+    we apply some math tricks instead of using `Sigmoid` layer with `BCELoss`.
+
+    The equation is:
+
+    if :attr:`reduction` = ``"none"``:
+
+    .. math::
+
+        out = -weight*[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    if :attr:`reduction` = ``"mean"``:
+
+    .. math::
+
+        out = -\\frac{weight}{n}\\sum_{i=1}^n[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    if :attr:`reduction` = ``"sum"``:
+
+    .. math::
+
+        out = -weight*\\sum_{i=1}^n[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    Args:
+        weight (Tensor, optional): The manual rescaling weight to the loss. Default: ``None``
+        size_average (bool, optional) 鈥� Deprecated (see :attr:`reduction`). Default: ``True``
+        reduce (bool, optional) 鈥� Deprecated (see :attr:`reduction`). Default: ``True``
+        reduction (str, optional): The reduce type, it can be one of ``"none"``, ``"mean"``, ``"sum"``.
+            ``'none'``: no reduction will be applied, ``'mean'``: the sum of the output will be divided
+            by the number of elements in the output, ``'sum'``: the output will be summed. Default: ``"mean"``
+        pos_weight (Tensor, optional): The manual rescaling weight to the positive examples.
+            Default: ``None``
+
+    Shape:
+        - Input: :math:`(N,*)` where `*` means, any number of additional dimensions
+        - Target: :math:`(N,*)`, same shape as the input
+        - Output: scalar. If :attr:`reduction` is ``"none"``, then :math:`(N,*)`, same shape as input.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> import oneflow.compatible.single_client.typing as tp
+
+        >>> input = flow.Tensor([[1.2, 0.2, -0.3], [0.7, 0.6, -2], [0.7, 0.6, -2]], dtype=flow.float32)
+        >>> target = flow.Tensor([[0, 1, 0], [1, 0, 1], [1, 0, 1]], dtype=flow.float32)
+        >>> weight = flow.Tensor([[2, 2, 2], [2, 2, 2], [2, 2, 2]], dtype=flow.float32)
+        >>> pos_weight = flow.Tensor([1.2, 1.3, 1.4], dtype=flow.float32)
+
+        >>> m = flow.nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight, reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([[2.9266, 1.5552, 1.1087],
+                [0.9676, 2.075 , 5.9554],
+                [0.9676, 2.075 , 5.9554]], dtype=oneflow.float32)
+
+        >>> m = flow.nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight, reduction="mean")
+        >>> out = m(input, target)
+        >>> out
+        tensor([2.6207], dtype=oneflow.float32)
+
+        >>> m = flow.nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight, reduction="sum")
+        >>> out = m(input, target)
+        >>> out
+        tensor([23.5865], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(
+        self,
+        weight=None,
+        size_average: bool = True,
+        reduce: bool = True,
+        reduction: Optional[str] = "mean",
+        pos_weight=None,
+    ) -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.weight = weight
+        self.size_average = size_average
+        self.reduce = reduce
+        self.reduction = reduction
+        self.pos_weight = pos_weight
+
+    def forward(self, input, target):
+        if not target.shape == input.shape:
+            raise ValueError(
+                "Target size ({}) must be the same as input size ({})".format(
+                    target.size(), input.size()
+                )
+            )
+        _neg_input = flow.experimental.negative(input)
+        _max_val = flow.experimental.clip(_neg_input, 0)
+        _neg_max_val = flow.experimental.negative(_max_val)
+        if self.pos_weight:
+            _log_weight = (self.pos_weight - 1) * target + 1
+            _loss = (1 - target) * input + _log_weight * (
+                flow.experimental.log(
+                    flow.experimental.exp(_neg_max_val)
+                    + flow.experimental.exp(_neg_input - _max_val)
+                )
+                + _max_val
+            )
+        else:
+            _loss = (1 - target) * input + _max_val
+            _loss += flow.experimental.log(
+                flow.experimental.exp(_neg_max_val)
+                + flow.experimental.exp(_neg_input - _max_val)
+            )
+        if self.weight is not None:
+            assert (
+                self.weight.shape == input.shape
+            ), "The weight shape must be the same as Input shape"
+            _weighted_loss = self.weight * _loss
+        else:
+            _weighted_loss = _loss
+        if self.reduction == "mean":
+            return flow.experimental.mean(_weighted_loss)
+        elif self.reduction == "sum":
+            return flow.experimental.sum(_weighted_loss)
+        else:
+            return _weighted_loss
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/masked_fill.py b/python/oneflow/compatible/single_client/nn/modules/masked_fill.py
new file mode 100644
index 0000000000000000000000000000000000000000..50449c415912ccaab017886fd8089bac707e28fa
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/masked_fill.py
@@ -0,0 +1,78 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class MaskedFill(Module):
+    def __init__(self, value) -> None:
+        super().__init__()
+        self.value = value
+
+    def forward(self, input, mask):
+        in_shape = tuple(input.shape)
+        value_like_x = flow.Tensor(*in_shape, device=input.device)
+        value_like_x.fill_(self.value)
+        return flow.F.where(mask, value_like_x, input)
+
+
+@register_tensor_op("masked_fill")
+def masked_fill_op(tensor, mask, value):
+    """
+    Fills elements of :attr:`self` tensor with :attr:`value` where :attr:`mask` is True.
+    The shape of :attr:`mask` must be broadcastable with the shape of the underlying tensor.
+
+    Args:
+        mask (BoolTensor): the boolean mask
+        value (float): the value to fill in with
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> in_arr = np.array(
+        ...     [[[-0.13169311,  0.97277078,  1.23305363,  1.56752789],
+        ...     [-1.51954275,  1.87629473, -0.53301206,  0.53006478],
+        ...     [-1.38244183, -2.63448052,  1.30845795, -0.67144869]],
+        ...     [[ 0.41502161,  0.14452418,  0.38968   , -1.76905653],
+        ...     [ 0.34675095, -0.7050969 , -0.7647731 , -0.73233418],
+        ...     [-1.90089858,  0.01262963,  0.74693893,  0.57132389]]]
+        ... )
+        >>> fill_value = 8.7654321 # random value e.g. -1e9 3.1415
+        >>> input = flow.Tensor(in_arr, dtype=flow.float32)
+        >>> mask = flow.Tensor((in_arr > 0).astype(np.int8), dtype=flow.int)
+        >>> output = flow.masked_fill(input, mask, fill_value)
+
+        # tensor([[[-0.1317,  8.7654,  8.7654,  8.7654],
+        #  [-1.5195,  8.7654, -0.533 ,  8.7654],
+        #  [-1.3824, -2.6345,  8.7654, -0.6714]],
+
+        # [[ 8.7654,  8.7654,  8.7654, -1.7691],
+        #  [ 8.7654, -0.7051, -0.7648, -0.7323],
+        #  [-1.9009,  8.7654,  8.7654,  8.7654]]], dtype=oneflow.float32)
+
+    """
+    return MaskedFill(value)(tensor, mask)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/masked_select.py b/python/oneflow/compatible/single_client/nn/modules/masked_select.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3e5c59a14dfd1ae3085b42e0028f0176139eaa1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/masked_select.py
@@ -0,0 +1,100 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.ops.array_ops import argwhere, gather, gather_nd
+
+
+class MaskedSelect(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, mask):
+        assert len(x.shape) == len(
+            mask.shape
+        ), f"The dim of masked_select module's inputs can not match, please check!"
+        broadcast_like_shape = []
+        broadcast_x_axes = []
+        broadcast_mask_axes = []
+        for i in range(len(x.shape)):
+            max_dim = max(x.shape[i], mask.shape[i])
+            broadcast_like_shape.append(max_dim)
+            if max_dim != x.shape[i]:
+                broadcast_x_axes.append(i)
+            if max_dim != mask.shape[i]:
+                broadcast_mask_axes.append(i)
+        broadcast_like_tensor = flow.experimental.zeros(
+            tuple(broadcast_like_shape), dtype=flow.float32, device=x.device
+        )
+        broadcast_like_tensor.requires_grad = x.requires_grad or mask.requires_grad
+        if len(broadcast_x_axes) != 0:
+            x = flow.experimental.broadcast_like(
+                x, broadcast_like_tensor, broadcast_axes=tuple(broadcast_x_axes)
+            )
+        if len(broadcast_mask_axes) != 0:
+            mask = flow.experimental.broadcast_like(
+                mask, broadcast_like_tensor, broadcast_axes=tuple(broadcast_mask_axes)
+            )
+        mask = mask.to(dtype=x.dtype)
+        res = flow.F.mul(x, mask)
+        indices = flow.experimental.argwhere(res)
+        gather_res = flow.F.gather_nd(res, indices)
+        return gather_res.flatten()
+
+
+def masked_select_op(x, mask):
+    """
+
+    Returns a new 1-D tensor which indexes the input tensor according to the boolean mask mask which is a BoolTensor(In oneFlow BoolTensor is replaced by Int8Tensor).
+
+    The shapes of the mask tensor and the input tensor don鈥檛 need to match, but they must be broadcastable.
+
+    Args:
+        input (Tensor): the input tensor.
+        mask (Tensor): the tensor containing the binary mask to index with
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([[-0.4620, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]), dtype=flow.float32)
+        >>> mask = x.gt(0.05)
+        >>> out = flow.masked_select(x, mask)
+        >>> out
+        tensor([0.3139, 0.3898], dtype=oneflow.float32)
+    """
+    return MaskedSelect()(x, mask)
+
+
+@register_tensor_op("masked_select")
+def tensor_masked_select_op(x, mask):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.masked_select`
+
+    """
+    return MaskedSelect()(x, mask)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/math_ops.py b/python/oneflow/compatible/single_client/nn/modules/math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..94abb3c7c043378ed7c36aa9af8b39a3e23fb211
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/math_ops.py
@@ -0,0 +1,1612 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import _check_axis
+from oneflow.compatible.single_client.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class ScalarMul(Module):
+    def __init__(self, alpha) -> None:
+        super().__init__()
+        if not isinstance(alpha, (int, float)):
+            raise ValueError("alpha type can only be int or float")
+        self.alpha = alpha
+
+    def forward(self, x):
+        return flow.F.mul_scalar(x, self.alpha)
+
+
+class ScalarMulByTensor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.mul_scalar_by_tensor(x, y)
+
+
+class ElementwiseMul(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.mul(x, y)
+
+
+class BroadcastMul(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_mul(x, y)
+
+
+@register_tensor_op("mul")
+def _mul(x, y):
+    """Computes the multiplication of x by y for each element, scalar and broadcast promotation are supported.
+    
+    The formula is:
+
+    .. math::
+        out = x \\times y
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        # element-wise multiply
+        >>> x = flow.Tensor(np.random.randn(2,3))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.mul(x,y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar mutiply
+        >>> x = 5
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.mul(x,y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast mutiply
+        >>> x = flow.Tensor(np.random.randn(1,1))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.mul(x,y).numpy()
+        >>> out.shape 
+        (2, 3)
+
+    """
+    if isinstance(x, (int, float)):
+        return ScalarMul(x)(y)
+    elif isinstance(y, (int, float)):
+        return ScalarMul(y)(x)
+    elif x.shape == y.shape:
+        return ElementwiseMul()(x, y)
+    elif x.shape == (1,):
+        return ScalarMulByTensor()(y, x)
+    elif y.shape == (1,):
+        return ScalarMulByTensor()(x, y)
+    else:
+        return BroadcastMul()(x, y)
+
+
+class Variance(Module):
+    def __init__(self, dim: int = None, keepdim: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, input):
+        axis = _check_axis(self.dim, input.shape)
+        if isinstance(axis, list) and len(axis) == 0:
+            return flow.experimental.zeros(size=input.shape)
+        else:
+            return flow.experimental.sub(
+                flow.experimental.mean(
+                    flow.experimental.square(input), axis, self.keepdim
+                ),
+                flow.experimental.square(
+                    flow.experimental.mean(input, axis, self.keepdim)
+                ),
+            )
+
+
+@register_tensor_op("var")
+def variance_op(input, dim=None, keepdim=False):
+    """Returns the variance of each row of the `input` tensor in the given dimension `dim`.
+
+    If `keepdim` is `True`, the output tensor is of the same size as `input` except in the dimension(s) `dim` 
+    where it is of size 1. Otherwise, dim is squeezed (see `flow.squeeze()`), resulting in the output 
+    tensor having 1 (or `len(dim)`) fewer dimension(s).
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of python:ints): the dimension or dimensions to reduce. Defaults to None.
+        keepdim (bool, optional): whether the output tensor has dim retained or not. Defaults to False.
+
+    Returns:
+        Tensor: The result of variance on the specified axis of input Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> np_arr = np.random.randn(2,3,4,5)
+        >>> input = flow.Tensor(np_arr)
+        >>> output = flow.var(input, 1, True)
+
+    """
+    return Variance(dim, keepdim)(input)
+
+
+class ScalarSubByTensor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.sub_scalar_by_tensor(x, y)
+
+
+class BroadcastSub(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_sub(x, y)
+
+
+class ScalarAdd(Module):
+    def __init__(self, alpha) -> None:
+        super().__init__()
+        if not isinstance(alpha, int) and (not isinstance(alpha, float)):
+            raise ValueError("scalar type can only be int or float")
+        self.alpha = alpha
+
+    def forward(self, x):
+        return flow.F.add_scalar(x, self.alpha)
+
+
+@register_tensor_op("sub")
+def _sub(x, y):
+    """Computes the subtraction of x by y for each element, scalar and broadcast promotation are supported.
+    The formula is:
+
+    .. math::
+        out = x - y
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        # element-wise subtract
+        >>> x = flow.Tensor(np.random.randn(2,3))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.sub(x,y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar subtract
+        >>> x = 5
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.sub(x,y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast subtract
+        >>> x = flow.Tensor(np.random.randn(1,1))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.sub(x,y).numpy()
+        >>> out.shape
+        (2, 3)
+
+    """
+    if isinstance(x, (int, float)):
+        return ScalarAdd(x)(ScalarMul(-1)(y))
+    elif isinstance(y, (int, float)):
+        return ScalarAdd(-1 * y)(x)
+    elif x.shape == y.shape:
+        return BroadcastSub()(x, y)
+    elif y.shape == (1,):
+        return ScalarSubByTensor()(x, y)
+    else:
+        return BroadcastSub()(x, y)
+
+
+class BroadcastDiv(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_div(x, y)
+
+
+class ScalarDivByTensor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, scalar):
+        return flow.F.div_scalar_by_tensor(x, scalar)
+
+
+@register_tensor_op("div")
+def _div(x, y):
+    """Computes the division of x by y for each element, scalar and broadcast promotation are supported.
+    The formula is:
+
+    .. math::
+        out = \\frac{X}{Y}
+    
+    Args:
+        x (Union[int, float, flow.Tensor]): X.
+        y (Union[int, float, flow.Tensor]): Y.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        # element-wise divide
+        >>> x = flow.Tensor(np.random.randn(2,3))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.div(x,y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar divide
+        >>> x = 5
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.div(x,y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast divide
+        >>> x = flow.Tensor(np.random.randn(1,1))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.div(x,y).numpy()
+        >>> out.shape 
+        (2, 3)
+
+    """
+    if isinstance(x, (int, float)):
+        return ScalarMul(x)(flow.experimental.reciprocal(y))
+    elif isinstance(y, (int, float)):
+        if y == 0 or y == 0.0:
+            y = 0.0
+        else:
+            y = 1.0 / float(y)
+        return ScalarMul(y)(x)
+    elif x.shape == y.shape:
+        return BroadcastDiv()(x, y)
+    elif y.shape == (1,):
+        return ScalarDivByTensor()(x, y)
+    else:
+        return BroadcastDiv()(x, y)
+
+
+class Reciprocal(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.reciprocal_no_nan(x)
+
+
+@register_tensor_op("reciprocal")
+def _reciprocal(x):
+    """Computes the safe reciprocal of x. If x is zero, the reciprocal will
+    be also set to zero.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([[1, 2, 3], [4, 5, 6]]))
+        >>> out = flow.reciprocal(x)
+        >>> out.numpy()
+        array([[1.        , 0.5       , 0.33333334],
+               [0.25      , 0.2       , 0.16666667]], dtype=float32)
+    """
+    return Reciprocal()(x)
+
+
+class ScalarAddByTensor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.add_scalar_by_tensor(x, y)
+
+
+class ElementwiseAdd(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.add(x, y)
+
+
+class BroadcastAdd(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_add(x, y)
+
+
+@register_tensor_op("add")
+def _add(x, y):
+    """Computes the addition of x by y for each element, scalar and broadcast promotation are supported.
+    The formula is:
+
+    .. math::
+        out = x + y
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        # element-wise add
+        >>> x = flow.Tensor(np.random.randn(2,3))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.add(x, y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar add
+        >>> x = 5
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.add(x, y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast add
+        >>> x = flow.Tensor(np.random.randn(1,1))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.add(x, y).numpy()
+        >>> out.shape
+        (2, 3)
+
+    """
+    if isinstance(x, (int, float)):
+        return ScalarAdd(x)(y)
+    elif isinstance(y, (int, float)):
+        return ScalarAdd(y)(x)
+    elif x.shape == y.shape:
+        return ElementwiseAdd()(x, y)
+    elif x.shape == (1,):
+        return ScalarAddByTensor()(y, x)
+    elif y.shape == (1,):
+        return ScalarAddByTensor()(x, y)
+    else:
+        return BroadcastAdd()(x, y)
+
+
+class Asin(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.asin(x)
+
+
+def asin_op(input):
+    """
+    Returns a new tensor with the arcsine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\sin^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor(np.array([-0.5,  0.8, 1.0,  -0.8]), dtype=flow.float32)
+        >>> output = flow.asin(input)
+        >>> output.shape
+        flow.Size([4])
+        >>> output
+        tensor([-0.5236,  0.9273,  1.5708, -0.9273], dtype=oneflow.float32)
+        >>> input1 = flow.Tensor(np.array([[0.8, 1.0], [-0.6, -1.0]]), dtype=flow.float32)
+        >>> output1 = input1.asin()
+        >>> output1.shape
+        flow.Size([2, 2])
+        >>> output1
+        tensor([[ 0.9273,  1.5708],
+                [-0.6435, -1.5708]], dtype=oneflow.float32)
+    """
+    return Asin()(input)
+
+
+@register_tensor_op("asin")
+def asin_op_tensor(input):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.asin`
+    """
+    return Asin()(input)
+
+
+def arcsin_op(input):
+    """
+  
+    Alias for :func:`oneflow.compatible.single_client.experimental.asin`
+    """
+    return Asin()(input)
+
+
+@register_tensor_op("arcsin")
+def arcsin_op_tensor(input):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.asin`
+    """
+    return Asin()(input)
+
+
+class Asinh(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.asinh(x)
+
+
+def asinh_op(input):
+    """
+    Returns a new tensor with the inverse hyperbolic sine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\sinh^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution() 
+        >>> input = flow.Tensor(np.array([2, 3, 4]), dtype=flow.float32)
+        >>> output = flow.asinh(input)
+        >>> output.shape
+        flow.Size([3])
+        >>> output
+        tensor([1.4436, 1.8184, 2.0947], dtype=oneflow.float32)
+
+        >>> input1 = flow.Tensor(np.array([[-1, 0, -0.4], [5, 7, 0.8]]), dtype=flow.float32)
+        >>> output1 = input1.asinh()
+        >>> output1.shape
+        flow.Size([2, 3])
+        >>> output1
+        tensor([[-0.8814,  0.    , -0.39  ],
+                [ 2.3124,  2.6441,  0.7327]], dtype=oneflow.float32)
+
+    """
+    return Asinh()(input)
+
+
+def arcsinh_op(input):
+    """
+  
+    Alias for :func:`oneflow.compatible.single_client.experimental.asinh`
+    """
+    return Asinh()(input)
+
+
+@register_tensor_op("asinh")
+def asinh_op_tensor(input):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.asinh`
+    """
+    return Asinh()(input)
+
+
+@register_tensor_op("arcsinh")
+def arcsinh_op_tensor(input):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.asinh`
+    """
+    return Asinh()(input)
+
+
+class Sin(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.sin(x)
+
+
+def sin_op(tensor):
+    """
+    Returns a new tensor with the sine of the elements of :attr:`input`.
+
+    .. math::
+
+        \\text{out}_{i} = \\sin(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> x1 = flow.Tensor(np.array([-0.5461,  0.1347, -2.7266, -0.2746]).astype(np.float32))
+        >>> out1 = flow.sin(x1)
+        >>> out1
+        tensor([-0.5194,  0.1343, -0.4032, -0.2712], dtype=oneflow.float32)
+        >>> x2 = flow.Tensor(np.array([-1.4, 2.6, 3.7]).astype(np.float32),device=flow.device('cuda'))
+        >>> out2 = flow.sin(x2)
+        >>> out2
+        tensor([-0.9854,  0.5155, -0.5298], device='cuda:0', dtype=oneflow.float32)
+
+    """
+    return Sin()(tensor)
+
+
+@register_tensor_op("sin")
+def sin_op_tensor(tensor):
+    """
+
+    sin() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.sin`
+    
+    """
+    return Sin()(tensor)
+
+
+class Cos(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.cos(x)
+
+
+@register_tensor_op("cos")
+def cos_op(tensor):
+    """
+    Returns a new tensor with the cosine  of the elements of :attr:`input`.
+    
+    .. math::
+        \\text{out}_{i} = \\cos(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> arr = np.array([1.4309,  1.2706, -0.8562,  0.9796])
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.cos(input).numpy()
+
+    """
+    return Cos()(tensor)
+
+
+class Atan(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.atan(x)
+
+
+def atan_op(tensor):
+    """
+    Returns a new tensor with the arctangent of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\tan^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+    
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor(np.array([0.5, 0.6, 0.7]), dtype=flow.float32)
+        >>> output = flow.atan(input)
+        >>> output.shape
+        flow.Size([3])
+        
+    """
+    return Atan()(tensor)
+
+
+@register_tensor_op("atan")
+def atan_op_tensor(tensor):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.atan`
+    
+    """
+    return Atan()(tensor)
+
+
+def arctan_op(tensor):
+    """
+    Alias for :func:`oneflow.compatible.single_client.experimental.atan`
+    
+    """
+    return Atan()(tensor)
+
+
+@register_tensor_op("arctan")
+def arctan_op_tensor(tensor):
+    """
+
+    See :func:`oneflow.compatible.single_client.experimental.arctan`
+    
+    """
+    return Atan()(tensor)
+
+
+class Log(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.log(x)
+
+
+@register_tensor_op("log")
+def log_op(tensor):
+    """
+    Returns a new tensor with the natural logarithm of the elements of :attr:`input`.
+    
+    .. math::
+        y_{i} = \\log_{e} (x_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> arr = np.random.randn(2, 3, 4, 5)
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.log(input)
+
+
+    """
+    return Log()(tensor)
+
+
+class Subtract(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if isinstance(x, (int, float)):
+            return ScalarAdd(x)(-1 * y)
+        elif isinstance(y, (int, float)):
+            return ScalarAdd(-1 * y)(x)
+        elif x.shape == y.shape:
+            return BroadcastSub()(x, y)
+        elif x.shape == (1,):
+            return ScalarSubByTensor()(y, x)
+        elif y.shape == (1,):
+            return ScalarSubByTensor()(x, y)
+        else:
+            return BroadcastSub()(x, y)
+
+
+class Sqrt(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.sqrt(input)
+
+
+@register_tensor_op("rsqrt")
+def rsqrt_op(input):
+    """Returns a new tensor with the reciprocal of the square-root of each of
+        the elements of :attr:`input`.
+
+        .. math::
+            \\text{out}_{i} = \\frac{1}{\\sqrt{\\text{input}_{i}}}
+
+        Args:
+            input (Tensor) 鈥� the input tensor.
+
+         For example:
+
+        .. code-block:: python
+
+            >>> import oneflow.compatible.single_client.experimental as flow
+            >>> import numpy as np
+            >>> flow.enable_eager_execution()
+
+            >>> a = flow.Tensor(np.array([1.0, 2.0, 3.0]))
+            >>> out = flow.rsqrt(a).numpy()
+            >>> out
+            array([1.        , 0.70710677, 0.57735026], dtype=float32)
+    """
+    return Rsqrt()(input)
+
+
+class Rsqrt(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.rsqrt(input)
+
+
+@register_tensor_op("sqrt")
+def sqrt_op(input):
+    """Returns a new tensor with the square-root of the elements of :attr:`input`.
+
+        .. math::
+            \\text{out}_{i} = \\sqrt{\\text{input}_{i}}
+
+        Args:
+            input (Tensor): the input tensor.
+
+         For example:
+
+        .. code-block:: python
+
+            >>> import oneflow.compatible.single_client.experimental as flow
+            >>> import numpy as np
+            >>> flow.enable_eager_execution()
+
+            >>> arr = np.array([1.0, 2.0, 3.0])
+            >>> input = flow.Tensor(arr)
+            >>> output = flow.sqrt(input).numpy()
+            >>> output
+            array([1.       , 1.4142135, 1.7320508], dtype=float32)
+        """
+    return Sqrt()(input)
+
+
+class Square(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.square(input)
+
+
+@register_tensor_op("square")
+def square_op(input):
+    """Returns a new tensor with the square of the elements of :attr:`input`.
+
+        .. math::
+            \\text{out}_{i} = \\sqrt{\\text{input}_{i}}
+
+        Args:
+            input (Tensor): the input tensor.
+
+         For example:
+
+        .. code-block:: python
+
+            >>> import oneflow.compatible.single_client.experimental as flow
+            >>> import numpy as np
+            >>> flow.enable_eager_execution()
+
+            >>> arr = np.array([1.0, 2.0, 3.0])
+            >>> input = flow.Tensor(arr)
+            >>> output = flow.square(input).numpy()
+            >>> output
+            array([1., 4., 9.], dtype=float32)
+        """
+    return Square()(input)
+
+
+class Std(Module):
+    def __init__(self, dim=None, unbiased=True, keepdim=False) -> None:
+        super().__init__()
+        assert unbiased == True, "Only support 'unbiased=True' for now!"
+        self.unbiased = unbiased
+        self.keepdim = keepdim
+        self.dim = dim
+        self.reduce_count = 1
+        self.square_op = Square()
+        self.sqrt_op = Sqrt()
+        self.subtract_op = Subtract()
+
+    def forward(self, x):
+        self.axis = _check_axis(self.dim, x.shape)
+        if isinstance(self.axis, list) and len(self.axis) == 0:
+            return flow.experimental.zeros(size=x.shape)
+        else:
+            if len(self.axis) == 0:
+                self.reduce_count = x.nelement()
+            else:
+                for i in self.axis:
+                    self.reduce_count *= x.shape[i]
+            sum = (
+                flow.experimental.sum(self.square_op(x), self.axis, self.keepdim)
+                / self.reduce_count
+            )
+            square = self.square_op(
+                flow.experimental.sum(x, self.axis, self.keepdim) / self.reduce_count
+            )
+            subtract = self.subtract_op(sum, square)
+            res = self.sqrt_op(subtract)
+            return res
+
+
+@register_tensor_op("std")
+def std_op(tensor, dim, unbiased=True, keepdim=False):
+    """
+    Returns the standard-deviation of each row of the :attr:`input` tensor in the
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+
+    If keepdim is True, the output tensor is of the same size as input except in 
+    the dimension(s) dim where it is of size 1. Otherwise, dim is squeezed, 
+    resulting in the output tensor having 1 (or len(dim)) fewer dimension(s).
+
+    If :attr:`unbiased` is ``False``, then the standard-deviation will be calculated
+    via the biased estimator. Otherwise, Bessel's correction will be used.
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of python:ints): the dimension or dimensions to reduce.
+        unbiased (bool): whether to use the unbiased estimation or not
+        keepdim (bool): whether the output tensor has `dim` retained or not.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> arr = np.array([1.0, 2.0, 3.0])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.std(input, dim=0).numpy()
+        >>> output
+        array([0.8164968], dtype=float32)
+
+    """
+    return Std(dim, unbiased, keepdim)(tensor)
+
+
+class Pow(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if isinstance(y, (int, float)):
+            return flow.F.pow_scalar(x, alpha=y)
+        else:
+            return flow.F.pow(x, y)
+
+
+@register_tensor_op("pow")
+def pow_op(tensor, exponent):
+    """Takes the power of each element in input with exponent and returns a tensor with the result. Exponent can be either a single float number, a single int number, or a tensor with the same shape as input.
+    When exponent is a scalar value, the operation applied is:
+
+    .. math::
+        \\text{out}_i = x_i ^ \\text{exponent}
+\u200b
+    When exponent is a tensor, the operation applied is:
+
+    .. math::
+        \\text{out}_i = x_i ^ {\\text{exponent}_i}
+
+    Args:
+        - input (Tensor): the input tensor.
+        - exponent (int, float, Tensor): the exponent.
+
+    Returns:
+        Tensor: The result of variance on the specified axis of input Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]))
+        >>> out = flow.pow(x, 2).numpy()
+        >>> out
+        array([ 1.,  4.,  9., 16., 25., 36.], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+        >>> y = flow.Tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+        >>> out = flow.pow(x, y).numpy()
+        >>> out
+        array([  1.,   4.,  27., 256.], dtype=float32)
+        
+    """
+    return Pow()(tensor, exponent)
+
+
+class Addmm(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self._matmul_op = (
+            flow.builtin_op("matmul")
+            .Input("a")
+            .Input("b")
+            .Output("out")
+            .Attr("transpose_a", False)
+            .Attr("transpose_b", False)
+            .Attr("alpha", 1.0)
+            .Build()
+        )
+
+    def forward(self, x, mat1, mat2, alpha=1, beta=1):
+        if len(x.shape) > 2 or len(mat1.shape) > 2 or len(mat2.shape) > 2:
+            raise ValueError("input matrixes shape can not be greater than 2")
+        else:
+            return _mul(x, beta) + _mul(self._matmul_op(mat1, mat2)[0], alpha)
+
+
+def addmm_op(input, mat1, mat2, alpha=1, beta=1):
+    """addmm(beta=1, input, alpha=1, mat1, mat2, out=None) -> Tensor
+
+    Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`.
+    The matrix :attr:`input` is added to the final result.
+
+    If :attr:`mat1` is a :math:`(n \\times m)` tensor, :attr:`mat2` is a
+    :math:`(m \\times p)` tensor, then :attr:`input` must be
+    broadcastable with a :math:`(n \\times p)` tensor
+    and :attr:`out` will be a :math:`(n \\times p)` tensor.
+
+    :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between
+    :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively.
+
+    .. math::
+        \\text{out} = \\beta\\ \\text{input} + \\alpha\\ (\\text{mat1}_i \\mathbin{@} \\text{mat2}_i)
+
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+
+    Args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\\beta`)
+        input (Tensor): matrix to be added
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\\alpha`)
+        mat1 (Tensor): the first matrix to be multiplied
+        mat2 (Tensor): the second matrix to be multiplied
+        out (Tensor, optional): the output tensor.
+
+    For example:
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> input = flow.tensor(np.array([[1,2,4],[5,11,9.1]]))
+        >>> mat1 = flow.tensor(np.array([[7.3,1.9,7.3],[10.2,1,5.5]])) 
+        >>> mat2 = flow.tensor(np.array([[7.3,1.9,7.3],[10.2,1,5.5],[3.7,2.2,8.1]])) 
+        >>> output = flow.addmm(input, mat1, mat2)
+        >>> output
+        tensor([[100.68,  33.83, 126.87],
+                [110.01,  43.48, 133.61]], dtype=oneflow.float64)
+        >>> output.shape
+        flow.Size([2, 3])
+
+        >>> input2 = flow.tensor(np.array([1.7]))
+        >>> mat1 = flow.tensor(np.array([[1,2],[5,9.1],[7.7,1.4]]))
+        >>> mat2 = flow.tensor(np.array([[1,2,3.7],[5,9.1,6.8]]))
+        >>> output2 = flow.addmm(input2, mat1, mat2, alpha=1, beta=2)
+        >>> output2
+        tensor([[14.4 , 23.6 , 20.7 ],
+                [53.9 , 96.21, 83.78],
+                [18.1 , 31.54, 41.41]], dtype=oneflow.float64)
+        >>> output2.shape
+        flow.Size([3, 3])
+    """
+    return Addmm()(input, mat1, mat2, alpha, beta)
+
+
+@register_tensor_op("addmm")
+def addmm_op_tensor(input, mat1, mat2, alpha=1, beta=1):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.addmm`
+    """
+    return Addmm()(input, mat1, mat2, alpha, beta)
+
+
+class Clamp(Module):
+    def __init__(self, min_value=None, max_value=None) -> None:
+        super().__init__()
+        if min_value is not None:
+            floating_min_value = float(min_value)
+            integral_min_value = int(min_value)
+        if max_value is not None:
+            floating_max_value = float(max_value)
+            integral_max_value = int(max_value)
+        if min_value is not None and max_value is not None:
+            self._op = (
+                flow.builtin_op("clip_by_scalar")
+                .Input("x")
+                .Output("y")
+                .Attr("floating_min", floating_min_value)
+                .Attr("integral_min", integral_min_value)
+                .Attr("floating_max", floating_max_value)
+                .Attr("integral_max", integral_max_value)
+                .Build()
+            )
+        elif min_value is not None:
+            self._op = (
+                flow.builtin_op("clip_by_scalar_min")
+                .Input("x")
+                .Output("y")
+                .Attr("floating_min", floating_min_value)
+                .Attr("integral_min", integral_min_value)
+                .Build()
+            )
+        elif max_value is not None:
+            self._op = (
+                flow.builtin_op("clip_by_scalar_max")
+                .Input("x")
+                .Output("y")
+                .Attr("floating_max", floating_max_value)
+                .Attr("integral_max", integral_max_value)
+                .Build()
+            )
+        else:
+            raise ValueError("min_value and max_value cannot be None at the same time")
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+def clamp_op(tensor, min=None, max=None):
+    """
+    Clamp all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]` and return
+    a resulting tensor:
+
+    .. math::
+        y_i = \\begin{cases}
+            \\text{min} & \\text{if } x_i < \\text{min} \\\\
+            x_i & \\text{if } \\text{min} \\leq x_i \\leq \\text{max} \\\\
+            \\text{max} & \\text{if } x_i > \\text{max}
+        \\end{cases}
+
+    If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, args :attr:`min`
+    and :attr:`max` must be real numbers, otherwise they should be integers.
+
+    Args:
+        input (Tensor): the input tensor.
+        min (Number): lower-bound of the range to be clamped to. Defaults to None.
+        max (Number): upper-bound of the range to be clamped to. Defaults to None.
+        out (Tensor, optional): the output tensor.
+
+    For example:
+
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> arr = np.array([0.2, 0.6, -1.5, -0.3])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.clamp(input, min=-0.5, max=0.5)
+        >>> output
+        tensor([ 0.2,  0.5, -0.5, -0.3], dtype=oneflow.float32)
+
+        >>> arr = np.array([0.2, 0.6, -1.5, -0.3])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.clamp(input, min=None, max=0.5)
+        >>> output
+        tensor([ 0.2,  0.5, -1.5, -0.3], dtype=oneflow.float32)
+
+        >>> arr = np.array([0.2, 0.6, -1.5, -0.3])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.clamp(input, min=-0.5, max=None)
+        >>> output
+        tensor([ 0.2,  0.6, -0.5, -0.3], dtype=oneflow.float32)
+
+    """
+    return Clamp(min, max)(tensor)
+
+
+@register_tensor_op("clamp")
+def clamp_op_tensor(tensor, min=None, max=None):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.clamp`
+    """
+    return Clamp(min, max)(tensor)
+
+
+def clip_op(tensor, min=None, max=None):
+    """
+    Alias for :func:`oneflow.compatible.single_client.experimental.clamp`
+    """
+    return Clamp(min, max)(tensor)
+
+
+@register_tensor_op("clip")
+def clip_op_tensor(tensor, min=None, max=None):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.clamp`
+    """
+    return Clamp(min, max)(tensor)
+
+
+class Cosh(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.cosh(x)
+
+
+@register_tensor_op("cosh")
+def cosh_op(tensor):
+    """
+    Returns a new tensor with the hyperbolic cosine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\cosh(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> arr = np.array([ 0.1632,  1.1835, -0.6979, -0.7325])
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.cosh(input).numpy()
+        >>> output
+        array([1.0133467, 1.7859949, 1.2535787, 1.2804903], dtype=float32)
+
+    """
+    return Cosh()(tensor)
+
+
+class Erf(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.erf(input)
+
+
+@register_tensor_op("erf")
+def erf_op(input):
+    """Computes the error function of each element. The error function is defined as follows:
+
+    .. math::
+            \\operatorname{erf}(x)=\\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x} e^{-t^{2}} d t
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor   
+               
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([0, -1., 10.]), dtype=flow.float32)
+        >>> out = flow.erf(x)
+        >>> out.shape
+        flow.Size([3])
+        >>> out.numpy()
+        array([ 0.       , -0.8427008,  1.       ], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8]]), dtype=flow.float32)
+        >>> out = flow.erf(x)
+        >>> out.shape
+        flow.Size([2, 3])
+        >>> out.numpy()
+        array([[ 0.        , -0.8427008 ,  1.        ],
+               [ 1.        ,  1.        ,  0.74210095]], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8], [2, 3, 4]]), dtype=flow.float32)
+        >>> out = x.erf()
+        >>> out.shape
+        flow.Size([3, 3])
+        >>> out.numpy()
+        array([[ 0.        , -0.8427008 ,  1.        ],
+               [ 1.        ,  1.        ,  0.74210095],
+               [ 0.9953223 ,  0.9999779 ,  1.        ]], dtype=float32)
+
+    """
+    return Erf()(input)
+
+
+@register_tensor_op("erf")
+def erf_op_tensor(input):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.erf`
+    """
+    return Erf()(input)
+
+
+class Erfc(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.erfc_op = flow.builtin_op("erfc").Input("x").Output("y").Build()
+
+    def forward(self, input):
+        return self.erfc_op(input)[0]
+
+
+@register_tensor_op("erfc")
+def erfc_op(input):
+    """Computes the complementary error function of each element of input. The complementary error 
+    function is defined as follows:
+
+    .. math::
+            \\operatorname{erfc}(x)=1-\\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x} e^{-t^{2}} d t
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.array([0, -1., 10.]), dtype=flow.float32)
+        >>> out = flow.erfc(x)
+        >>> out.shape
+        flow.Size([3])
+        >>> out.numpy()
+        array([1.0000000e+00, 1.8427007e+00, 2.8025969e-45], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8]]), dtype=flow.float32)
+        >>> out = flow.erfc(x)
+        >>> out.shape
+        flow.Size([2, 3])
+        >>> out.numpy()
+        array([[1.0000000e+00, 1.8427007e+00, 2.8025969e-45],
+               [1.5374597e-12, 4.1838257e-23, 2.5789905e-01]], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8], [2, 3, 4]]), dtype=flow.float32)
+        >>> out = x.erfc()
+        >>> out.shape
+        flow.Size([3, 3])
+        >>> out.numpy()
+        array([[1.0000000e+00, 1.8427007e+00, 2.8025969e-45],
+               [1.5374597e-12, 4.1838257e-23, 2.5789905e-01],
+               [4.6777348e-03, 2.2090499e-05, 1.5417259e-08]], dtype=float32)
+        
+    """
+    return Erfc()(input)
+
+
+@register_tensor_op("erfc")
+def erfc_op_tensor(input):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.erfc`
+    """
+    return Erfc()(input)
+
+
+class Ceil(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.ceil(x)
+
+
+def ceil_op(x):
+    """Returns a new tensor with the ceil of the elements of :attr:`x`,
+    the smallest integer greater than or equal to each element.
+
+    The equation is: 
+
+    .. math::
+        \\text{out}_{i} = \\left\\lceil \\text{input}_{i} \\right\\rceil = \\left\\lfloor \\text{input}_{i} \\right\\rfloor + 1
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor.
+    
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example: 
+
+
+    .. code-block:: python 
+        
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution() 
+        
+        >>> x = flow.Tensor(np.array([0.1, -2, 3.4]).astype(np.float32))
+        >>> y = flow.ceil(x)
+        >>> print(y.shape)
+        flow.Size([3])
+        >>> print(y.numpy())
+        [ 1. -2.  4.]
+
+
+        >>> x = flow.Tensor(np.array([[2.5, 4.6, 0.6],[7.8, 8.3, 9.2]]).astype(np.float32))
+        >>> y = x.ceil()
+        >>> print(y.shape)
+        flow.Size([2, 3])
+        >>> print(y.numpy())
+        [[ 3.  5.  1.]
+         [ 8.  9. 10.]]
+
+
+
+
+        >>> x = flow.Tensor(np.array([[[2.2, 4.4, 6.5],[7.1, 8.2, 9.3]],[[10.6,11.2,12.2],[13.5,14.8,15.9]]]).astype(np.float32))
+        >>> y = flow.ceil(x)
+        >>> print(y.shape)
+        flow.Size([2, 2, 3])
+        >>> print(y.numpy())
+        [[[ 3.  5.  7.]
+          [ 8.  9. 10.]]
+        <BLANKLINE>
+         [[11. 12. 13.]
+          [14. 15. 16.]]]
+
+    """
+    return Ceil()(x)
+
+
+@register_tensor_op("ceil")
+def ceil_op_tensor(x):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.ceil`
+    """
+    return Ceil()(x)
+
+
+class Expm1(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.expm1(x)
+
+
+def expm1_op(x):
+    """Returns a new tensor with the exponential of the elements minus 1
+    of :attr:`x`.
+
+
+    The equation is: 
+
+    .. math::
+        y_{i} = e^{x_{i}} - 1
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor.
+    
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example: 
+
+    .. code-block:: python 
+        
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution() 
+        
+        >>> x = flow.Tensor(np.array([1, 2, 3]).astype(np.float32))
+        >>> y = flow.expm1(x)
+        >>> print(y.shape)
+        flow.Size([3])
+        >>> print(y.numpy())
+        [ 1.7182817  6.389056  19.085537 ]
+
+
+        >>> x = flow.Tensor(np.array([[2, 4, 6],[7, 8, 9]]).astype(np.float32))
+        >>> y = x.expm1()
+        >>> print(y.shape)
+        flow.Size([2, 3])
+        >>> print(y.numpy())
+        [[6.3890562e+00 5.3598152e+01 4.0242880e+02]
+         [1.0956332e+03 2.9799580e+03 8.1020840e+03]]
+
+
+
+        >>> x = flow.Tensor(np.array([[[2, 4, 6],[7, 8, 9]],[[10,11,12],[13,14,15]]]).astype(np.float32))
+        >>> y = flow.expm1(x)
+        >>> print(y.shape)
+        flow.Size([2, 2, 3])
+        >>> print(y.numpy())
+        [[[6.3890562e+00 5.3598152e+01 4.0242880e+02]
+          [1.0956332e+03 2.9799580e+03 8.1020840e+03]]
+        <BLANKLINE>
+         [[2.2025465e+04 5.9873141e+04 1.6275380e+05]
+          [4.4241238e+05 1.2026032e+06 3.2690165e+06]]]
+
+
+    """
+    return Expm1()(x)
+
+
+@register_tensor_op("expm1")
+def expm1_op_tensor(x):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.expm1`
+    """
+    return Expm1()(x)
+
+
+class Topk(Module):
+    def __init__(
+        self, k, dim: int = None, largest: bool = True, sorted: bool = True
+    ) -> None:
+        super().__init__()
+        self._op_topk_last_dim = (
+            flow.builtin_op("top_k")
+            .Input("in")
+            .Output("out")
+            .Attr("k", k)
+            .Attr("sorted", sorted)
+            .Build()
+        )
+        self.dim = dim
+        self.largest = largest
+
+    def forward(self, input):
+        if self.dim == None:
+            self.dim = -1
+        num_axes = len(input.shape)
+        axis = self.dim if self.dim >= 0 else self.dim + num_axes
+        assert 0 <= axis < num_axes, "axis out of range"
+        if axis == num_axes - 1:
+            if self.largest:
+                indices = self._op_topk_last_dim(input)[0]
+            else:
+                neg_input = flow.experimental.mul(input, -1)
+                indices = self._op_topk_last_dim(neg_input)[0]
+            return (flow.experimental.gather(input, indices, dim=axis), indices)
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+            x = flow.F.transpose(input, perm=perm)
+            if self.largest:
+                indices = self._op_topk_last_dim(x)[0]
+            else:
+                neg_input = flow.experimental.mul(x, -1)
+                indices = self._op_topk_last_dim(neg_input)[0]
+            indices = flow.F.transpose(indices, perm=get_inversed_perm(perm))
+            return (flow.experimental.gather(input, indices, dim=axis), indices)
+
+
+@register_tensor_op("topk")
+def topk_op(input, k, dim: int = None, largest: bool = True, sorted: bool = True):
+    """Finds the values and indices of the k largest entries at specified axis.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): Input Tensor
+        dim (int, optional): the dimension to sort along. Defaults to the last dim (-1)
+        largest (bool, optional): controls whether to return largest or smallest elements
+        sorted (bool, optional): controls whether to return the elements in sorted order
+
+    Returns:
+        Tuple(oneflow.compatible.single_client.Tensor, oneflow.compatible.single_client.Tensor(dtype=int32)): A tuple of (values, indices), where
+        the indices are the indices of the elements in the original input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> x = np.array([[1, 3, 8, 7, 2], [1, 9, 4, 3, 2]], dtype=np.float32)
+        >>> (values, indices) = flow.topk(flow.Tensor(x), k=3, dim=1)
+        >>> values
+        tensor([[8., 7., 3.],
+                [9., 4., 3.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[2, 3, 1],
+                [1, 2, 3]], dtype=oneflow.int32)
+        >>> values.shape
+        flow.Size([2, 3])
+        >>> indices.shape
+        flow.Size([2, 3])
+        >>> (values, indices) = flow.topk(flow.Tensor(x), k=2, dim=1, largest=False)
+        >>> values
+        tensor([[1., 2.],
+                [1., 2.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[0, 4],
+                [0, 4]], dtype=oneflow.int32)
+        >>> values.shape
+        flow.Size([2, 2])
+        >>> indices.shape
+        flow.Size([2, 2])
+
+    """
+    return Topk(k=k, dim=dim, largest=largest, sorted=sorted)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/matmul.py b/python/oneflow/compatible/single_client/nn/modules/matmul.py
new file mode 100644
index 0000000000000000000000000000000000000000..570c0b14536ba272c713cdda7ce96f8550cb63b6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/matmul.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class MatMul(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, a, b):
+        assert len(a.shape) >= 2, "Tensor a's dim should >=2"
+        assert len(b.shape) >= 2, "Tensor b's dim should >=2"
+        if len(a.shape) == len(b.shape):
+            if len(a.shape) == 2:
+                res = flow.F.matmul(a, b)
+            else:
+                res = flow.F.batch_matmul(a, b)
+        else:
+            assert (
+                len(b.shape) == 2
+            ), "Not support number of dimensions of a being less than number of dimensions of b!"
+            res = flow.F.broadcast_matmul(a, b)
+        return res
+
+
+@register_tensor_op("matmul")
+def matmul_op(a, b):
+    """This operator applies matrix multiplication to two Tensor.
+
+    Args:
+        a (oneflow.compatible.single_client.Tensor): A Tensor
+        b (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> input1 = flow.Tensor(np.random.randn(2, 6), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(6, 5), dtype=flow.float32)
+        >>> of_out = flow.matmul(input1, input2)
+        >>> of_out.shape
+        flow.Size([2, 5])
+
+    """
+    return MatMul()(a, b)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/meshgrid.py b/python/oneflow/compatible/single_client/nn/modules/meshgrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..b55638d8199b0f4c1e1d03e398e504812e3a8120
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/meshgrid.py
@@ -0,0 +1,93 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class MeshGrid(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, inputs):
+        size = len(inputs)
+        assert size > 0, f"meshgrid expects a non-empty TensorList"
+        shape = list()
+        for i in range(size):
+            assert inputs[i].dim() <= 1, f(
+                "Expected scalar or 1D tensor in the tensor list but got: ", inputs[i]
+            )
+            if inputs[i].dim() == 0:
+                shape.append(1)
+            else:
+                shape.append(inputs[i].shape[0])
+        for i in range(size - 1):
+            assert (
+                inputs[i].dtype == inputs[i + 1].dtype
+                and inputs[i].device == inputs[i + 1].device
+            ), f"meshgrid expects all tensors to have the same dtype and device"
+        outputs = []
+        for i in range(size):
+            view_shape = [1] * size
+            view_shape[i] = -1
+            outputs.append(inputs[i].reshape(view_shape).expand(*shape))
+        return outputs
+
+
+def meshgrid_op(*inputs):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/_modules/torch/functional.html#meshgrid
+    
+    Take :math:`N` tensors, each of which can be either scalar or 1-dimensional
+    vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by
+    expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs.
+
+    Args:
+        tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be
+            treated as tensors of size :math:`(1,)` automatically
+
+    Returns:
+        seq (sequence of Tensors): If the input has :math:`k` tensors of size
+        :math:`(N_1,), (N_2,), \\ldots , (N_k,)`, then the output would also have :math:`k` tensors,
+        where all tensors are of size :math:`(N_1, N_2, \\ldots , N_k)`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([4, 5, 6]), dtype=flow.float32)
+        >>> of_x, of_y = flow.meshgrid(input1, input2)
+        >>> of_x
+        tensor([[1., 1., 1.],
+                [2., 2., 2.],
+                [3., 3., 3.]], dtype=oneflow.float32)
+        >>> of_y
+        tensor([[4., 5., 6.],
+                [4., 5., 6.],
+                [4., 5., 6.]], dtype=oneflow.float32)
+    """
+    return MeshGrid()(inputs)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/ne.py b/python/oneflow/compatible/single_client/nn/modules/ne.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb5238dfa8642c46862c3cedc5ef46a0c3255a78
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/ne.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Ne(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, other):
+        if isinstance(other, flow.Tensor) or isinstance(
+            other, oneflow._oneflow_internal.Tensor
+        ):
+            for i in range(len(input.size())):
+                assert (
+                    input.shape[i] >= other.shape[i]
+                ), "The second tensor's shape should broadcastable with the first argument."
+                if input.dtype != other.dtype:
+                    other = other.to(dtype=input.dtype)
+        elif isinstance(other, int) or isinstance(other, float):
+            other = flow.Tensor([other], dtype=input.dtype, device=input.device)
+        else:
+            raise NotImplementedError(
+                "Unsupport data type, The second argument can be a tensor whose shape is broadcastable with the first argument."
+            )
+        return flow.F.broadcast_not_equal(input, other)
+
+
+@register_tensor_op("ne")
+def ne_op(input, other):
+    """
+    Computes element-wise not equality.
+    The second argument can be a number or a tensor whose shape is broadcastable with the first argument.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): the tensor to compare
+        other (oneflow.compatible.single_client.Tensor, float or int): the target to compare
+
+    Returns:
+
+        - A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.array([2, 3, 4, 5]), dtype=flow.float32)
+        >>> other = flow.Tensor(np.array([2, 3, 4, 1]), dtype=flow.float32)
+
+        >>> y = flow.ne(input, other)
+        >>> y
+        tensor([0, 0, 0, 1], dtype=oneflow.int8)
+
+    """
+    return Ne()(input, other)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/negative.py b/python/oneflow/compatible/single_client/nn/modules/negative.py
new file mode 100644
index 0000000000000000000000000000000000000000..3473e0cce38a33480e7c20106a0d3deddb542140
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/negative.py
@@ -0,0 +1,61 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Negative(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.negative(x)
+
+
+@register_tensor_op("negative")
+def negative_op(x):
+    """This operator computes the negative value of Tensor.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(
+        ...    np.array([1.0, -1.0, 2.3]).astype(np.float32), dtype=flow.float32
+        ... )
+        >>> out = flow.negative(input)
+        >>> out
+        tensor([-1. ,  1. , -2.3], dtype=oneflow.float32)
+
+    """
+    return Negative()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/norm.py b/python/oneflow/compatible/single_client/nn/modules/norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6117925498434372e0c2621e191ef21f48660629
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/norm.py
@@ -0,0 +1,256 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Norm(Module):
+    def __init__(self, ord=None, dim=None, keepdim=False) -> None:
+        super().__init__()
+        self.ord = ord
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def _vector_norm(self, x, ord, dim):
+        if isinstance(ord, str) and ord in ["fro", "nuc"]:
+            raise ValueError("Norm order {} is not supported for vectors".format(ord))
+        elif isinstance(ord, float) and ord in [float("inf"), float("-inf")]:
+            if ord == float("inf"):
+                return flow.experimental.max(flow.experimental.abs(x), dim=dim)
+            else:
+                return flow.experimental.min(flow.experimental.abs(x), dim=dim)
+        elif isinstance(ord, int):
+            if ord == 0:
+                return flow.tensor([flow.experimental.argwhere(x).shape[0]])
+            else:
+                return flow.experimental.pow(
+                    flow.experimental.sum(
+                        flow.experimental.pow(flow.experimental.abs(x), ord), dim=dim
+                    ),
+                    1.0 / ord,
+                )
+        else:
+            raise ValueError("Invalid norm order: {}".format(ord))
+
+    def _matrix_norm(self, x, ord, dim):
+        if isinstance(ord, str) and ord in ["fro", "nuc"]:
+            if ord == "nuc":
+                raise NotImplementedError
+            else:
+                return flow.experimental.sqrt(
+                    flow.experimental.sum(flow.experimental.square(x), dim=dim)
+                )
+        elif isinstance(ord, float) and ord in [float("inf"), float("-inf")]:
+            if ord == float("inf"):
+                return flow.experimental.max(
+                    flow.experimental.sum(flow.experimental.abs(x), dim=1)
+                )
+            else:
+                return flow.experimental.min(
+                    flow.experimental.sum(flow.experimental.abs(x), dim=1)
+                )
+        elif isinstance(ord, int):
+            if ord == 1:
+                return flow.experimental.max(
+                    flow.experimental.sum(flow.experimental.abs(x), dim=0)
+                )
+            elif ord == -1:
+                return flow.experimental.min(
+                    flow.experimental.sum(flow.experimental.abs(x), dim=0)
+                )
+            elif ord == 2:
+                raise NotImplementedError
+            elif ord == -2:
+                raise NotImplementedError
+            else:
+                raise ValueError(
+                    "Norm order {} is not supported for matrices".format(ord)
+                )
+        else:
+            raise ValueError("Invalid norm order: {}".format(ord))
+
+    def _whether_keepdim(self, x):
+        if self.keepdim == True and self.dim != None:
+            return flow.experimental.unsqueeze(x, self.dim)
+        else:
+            return x
+
+    def forward(self, x):
+        num_axes = len(x.shape)
+        if self.dim == None and self.ord == None:
+            res = self._vector_norm(x.reshape((1, -1))[0], ord=2, dim=self.dim)
+        elif self.dim == None and self.ord != None:
+            assert (
+                num_axes <= 2
+            ), "input must be 1-D or 2-D when dim is None and ord is not None"
+            res = (
+                self._vector_norm(x, self.ord, self.dim)
+                if num_axes == 1
+                else self._matrix_norm(x, self.ord, self.dim)
+            )
+        elif isinstance(self.dim, (int, tuple, list)):
+            if isinstance(self.dim, int):
+                self.dim = self.dim if self.dim >= 0 else self.dim + num_axes
+                assert 0 <= self.dim < num_axes, "dim out of range"
+                res = self._vector_norm(
+                    x, ord=2 if self.ord == None else self.ord, dim=self.dim
+                )
+            else:
+                temp = list(self.dim) if isinstance(self.dim, tuple) else self.dim
+                for i in range(len(temp)):
+                    temp[i] = temp[i] if temp[i] >= 0 else temp[i] + num_axes
+                    assert 0 <= temp[i] < num_axes, "dim out of range"
+                self.dim = temp
+                res = self._matrix_norm(
+                    x, ord="fro" if self.ord == None else self.ord, dim=self.dim
+                )
+        else:
+            raise ValueError("Invalid dimension: {}".format(self.dim))
+        return self._whether_keepdim(res)
+
+
+def norm_op(input, ord=None, dim=None, keepdim=False):
+    """linalg.norm(input, ord=None, dim=None, keepdim=False, *, out=None) -> Tensor
+
+    Returns the matrix norm or vector norm of a given tensor.
+
+    This function can calculate one of eight different types of matrix norms, or one
+    of an infinite number of vector norms, depending on both the number of reduction
+    dimensions and the value of the `ord` parameter.
+
+    Args:
+        input (Tensor): The input tensor. If dim is None, input must be 1-D or 2-D, unless :attr:`ord`
+            is None. If both :attr:`dim` and :attr:`ord` are None, the 2-norm of the input flattened to 1-D
+            will be returned. Its data type must be either a floating point or complex type. For complex
+            inputs, the norm is calculated on of the absolute values of each element. If the input is
+            complex and neither :attr:`dtype` nor :attr:`out` is specified, the result's data type will
+            be the corresponding floating point type (e.g. float if :attr:`input` is complexfloat).
+
+        ord (int, float, inf, -inf, 'fro', 'nuc', optional): The order of norm.
+            inf refers to :attr:`float('inf')`, numpy's :attr:`inf` object, or any equivalent object.
+            The following norms can be calculated:
+
+            =====  ============================  ==========================
+            ord    norm for matrices             norm for vectors
+            =====  ============================  ==========================
+            None   Frobenius norm                2-norm
+            'fro'  Frobenius norm                -- not supported --
+            'nuc'  -- not supported yet --       -- not supported --
+            inf    max(sum(abs(x), dim=1))       max(abs(x))
+            -inf   min(sum(abs(x), dim=1))       min(abs(x))
+            0      -- not supported --           sum(x != 0)
+            1      max(sum(abs(x), dim=0))       as below
+            -1     min(sum(abs(x), dim=0))       as below
+            2      -- not supported yet --       as below
+            -2     -- not supported yet --       as below
+            other  -- not supported --           sum(abs(x)**ord)**(1./ord)
+            =====  ============================  ==========================
+
+            Default: ``None``
+
+        dim (int, 2-tuple of ints, 2-list of ints, optional): If :attr:`dim` is an int,
+            vector norm will be calculated over the specified dimension. If :attr:`dim`
+            is a 2-tuple of ints, matrix norm will be calculated over the specified
+            dimensions. If :attr:`dim` is None, matrix norm will be calculated
+            when the input tensor has two dimensions, and vector norm will be
+            calculated when the input tensor has one dimension. Default: ``None``
+
+        keepdim (bool, optional): If set to True, the reduced dimensions are retained
+            in the result as dimensions with size one. Default: ``False``
+
+        out (Tensor, optional): The output tensor.
+
+    Examples::
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> from oneflow.compatible.single_client.experimental import linalg as LA
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> a = flow.tensor(np.arange(9, dtype=np.float32) - 4)
+        >>> a
+        tensor([-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.], dtype=oneflow.float32)
+        >>> b = a.reshape((3, 3))
+        >>> b
+        tensor([[-4., -3., -2.],
+                [-1.,  0.,  1.],
+                [ 2.,  3.,  4.]], dtype=oneflow.float32)
+
+        >>> LA.norm(a)
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(b)
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(b, 'fro')
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(a, float('inf'))
+        tensor([4.], dtype=oneflow.float32)
+        >>> LA.norm(b, float('inf'))
+        tensor([9.], dtype=oneflow.float32)
+        >>> LA.norm(a, -float('inf'))
+        tensor([0.], dtype=oneflow.float32)
+        >>> LA.norm(b, -float('inf'))
+        tensor([2.], dtype=oneflow.float32)
+
+        >>> LA.norm(a, 1)
+        tensor([20.], dtype=oneflow.float32)
+        >>> LA.norm(b, 1)
+        tensor([7.], dtype=oneflow.float32)
+        >>> LA.norm(a, -1)
+        tensor([0.], dtype=oneflow.float32)
+        >>> LA.norm(b, -1)
+        tensor([6.], dtype=oneflow.float32)
+        >>> LA.norm(a, 2)
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(a, -2)
+        tensor([0.], dtype=oneflow.float32)
+        >>> LA.norm(a, 3)
+        tensor([5.848], dtype=oneflow.float32)
+        >>> LA.norm(a, -3)
+        tensor([0.], dtype=oneflow.float32)
+
+    Using the :attr:`dim` argument to compute vector norms::
+
+        >>> c = flow.tensor([[1., 2., 3.],
+        ...                   [-1, 1, 4]])
+        >>> LA.norm(c, dim=0)
+        tensor([1.4142, 2.2361, 5.    ], dtype=oneflow.float32)
+        >>> LA.norm(c, dim=1, keepdim = True)
+        tensor([[3.7417],
+                [4.2426]], dtype=oneflow.float32)
+        >>> LA.norm(c, ord=1, dim=1)
+        tensor([6., 6.], dtype=oneflow.float32)
+
+    Using the :attr:`dim` argument to compute matrix norms::
+
+        >>> m = flow.tensor(np.arange(8, dtype=np.float32)).reshape((2, 2, 2))
+        >>> LA.norm(m, dim=(1,2))
+        tensor([ 3.7417, 11.225 ], dtype=oneflow.float32)
+    """
+    return Norm(ord, dim, keepdim)(input)
+
+
+@register_tensor_op("norm")
+def norm_tensor_op(input, ord=None, dim=None, keepdim=False):
+    """
+    See :func:`oneflow.compatible.single_client.experimental.linalg.norm.`
+    """
+    return Norm(ord, dim, keepdim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/normalization.py b/python/oneflow/compatible/single_client/nn/modules/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..e207bdad9ccbd49e4e1cdb129b567b65b5d3d5dd
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/normalization.py
@@ -0,0 +1,325 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Tuple, Union
+
+import oneflow
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn import init
+from oneflow.compatible.single_client.nn.module import Module
+
+_shape_t = Union[int, Tuple[int], oneflow._oneflow_internal.Size]
+
+
+class GroupNorm(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.GroupNorm.html
+
+    Applies Group Normalization over a mini-batch of inputs as described in
+    the paper `Group Normalization <https://arxiv.org/abs/1803.08494>`__
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The input channels are separated into :attr:`num_groups` groups, each containing
+    ``num_channels / num_groups`` channels. The mean and standard-deviation are calculated
+    separately over the each group. :math:`\\gamma` and :math:`\\beta` are learnable
+    per-channel affine transform parameter vectors of size :attr:`num_channels` if
+    :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+
+    Args:
+        num_groups (int): number of groups to separate the channels into
+        num_channels (int): number of channels expected in input
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        affine: a boolean value that when set to ``True``, this module
+            has learnable per-channel affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+
+    Shape:
+        - Input: :math:`(N, C, *)` where :math:`C=\\text{num_channels}`
+        - Output: :math:`(N, C, *)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.random.randn(20, 6, 10, 10))
+        >>> # Separate 6 channels into 3 groups
+        >>> m = flow.nn.GroupNorm(3, 6)
+        >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm)
+        >>> m = flow.nn.GroupNorm(6, 6)
+        >>> # Put all 6 channels into a single group (equivalent with LayerNorm)
+        >>> m = flow.nn.GroupNorm(1, 6)
+        >>> # Activating the module
+        >>> output = m(input)
+    
+"""
+
+    def __init__(
+        self,
+        num_groups: int,
+        num_channels: int,
+        eps: float = 1e-05,
+        affine: bool = True,
+    ) -> None:
+        super().__init__()
+        assert num_groups > 0, "The num_groups must larger than zero"
+        assert num_channels > 0, "The num_channels must larger than zero"
+        self.num_groups = num_groups
+        self.num_channels = num_channels
+        self.eps = eps
+        self.affine = affine
+        if self.affine:
+            self.weight = flow.nn.Parameter(flow.Tensor(1, num_channels, 1))
+            self.bias = flow.nn.Parameter(flow.Tensor(1, num_channels, 1))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        if self.affine:
+            flow.nn.init.ones_(self.weight)
+            flow.nn.init.zeros_(self.bias)
+
+    def forward(self, input: Tensor) -> Tensor:
+        assert (
+            len(input.shape) >= 3
+        ), "The dimensions of input tensor must larger than 2"
+        assert (
+            input.shape[1] == self.num_channels
+        ), "The channels of input tensor must equal num_channels"
+        origin_shape = input.shape
+        reshape_to_1d = flow.experimental.reshape(
+            input, shape=[origin_shape[0], self.num_groups, -1]
+        )
+        mean = flow.experimental.mean(reshape_to_1d, dim=2, keepdim=True)
+        variance = flow.experimental.var(reshape_to_1d, dim=2, keepdim=True)
+        normalized = (reshape_to_1d - mean) / flow.experimental.sqrt(
+            variance + self.eps
+        )
+        normalized = flow.experimental.reshape(
+            normalized, shape=[origin_shape[0], self.num_channels, -1]
+        )
+        if self.weight:
+            normalized = normalized * self.weight
+        if self.bias:
+            normalized = normalized + self.bias
+        res = flow.experimental.reshape(normalized, shape=tuple(input.shape))
+        return res
+
+
+class LayerNorm(Module):
+    """Applies Layer Normalization over a mini-batch of inputs as described in
+    the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__
+
+    .. math::
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated separately over the last
+    certain number dimensions which have to be of the shape specified by
+    :attr:`normalized_shape`.
+    :math:`\\gamma` and :math:`\\beta` are learnable affine transform parameters of
+    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator.
+
+    .. note::
+        Unlike Batch Normalization and Instance Normalization, which applies
+        scalar scale and bias for each entire channel/plane with the
+        :attr:`affine` option, Layer Normalization applies per-element scale and
+        bias with :attr:`elementwise_affine`.
+
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+
+    Args:
+        normalized_shape (int or list or oneflow.compatible.single_client.Size): input shape from an expected input of size
+
+            .. math::
+                [* \\times \\text{normalized_shape}[0] \\times \\text{normalized_shape}[1] \\times \\ldots \\times \\text{normalized_shape}[-1]]
+
+            If a single integer is used, it is treated as a singleton list, and this module will
+
+            normalize over the last dimension which is expected to be of that specific size.
+
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        elementwise_affine: a boolean value that when set to ``True``, this module
+            has learnable per-element affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+
+    Shape:
+        - Input: :math:`(N, *)`
+        - Output: :math:`(N, *)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input_arr = np.array(
+        ...     [
+        ...         [
+        ...             [[-0.16046895, -1.03667831], [-0.34974465, 0.26505867]],
+        ...             [[-1.24111986, -0.53806001], [1.72426331, 0.43572459]],
+        ...         ],
+        ...         [
+        ...             [[-0.77390957, -0.42610624], [0.16398858, -1.35760343]],
+        ...             [[1.07541728, 0.11008703], [0.26361224, -0.48663723]],
+        ...         ],
+        ...     ],
+        ...     dtype=np.float32,
+        ... )
+
+        >>> x = flow.Tensor(input_arr)
+        >>> m = flow.nn.LayerNorm(2)
+        >>> y = m(x).numpy()
+        >>> y
+        array([[[[ 0.99997395, -0.99997395],
+                 [-0.999947  ,  0.999947  ]],
+        <BLANKLINE>
+                [[-0.9999596 ,  0.9999594 ],
+                 [ 0.999988  , -0.999988  ]]],
+        <BLANKLINE>
+        <BLANKLINE>
+               [[[-0.9998343 ,  0.9998341 ],
+                 [ 0.9999914 , -0.9999914 ]],
+        <BLANKLINE>
+                [[ 0.99997866, -0.99997866],
+                 [ 0.9999646 , -0.9999646 ]]]], dtype=float32)
+
+    """
+
+    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
+    normalized_shape: Tuple[int, ...]
+    eps: float
+    elementwise_affine: bool
+
+    def __init__(
+        self,
+        normalized_shape: _shape_t,
+        eps: float = 1e-05,
+        elementwise_affine: bool = True,
+    ) -> None:
+        super(LayerNorm, self).__init__()
+        if isinstance(normalized_shape, int):
+            normalized_shape = (normalized_shape,)
+        self.normalized_shape = tuple(normalized_shape)
+        self.epsilon = eps
+        self.elementwise_affine = elementwise_affine
+        if self.elementwise_affine:
+            self.weight = flow.nn.Parameter(flow.Tensor(*self.normalized_shape))
+            self.bias = flow.nn.Parameter(flow.Tensor(*self.normalized_shape))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+        self.begin_norm_axis = 1
+        self.begin_params_axis = 1
+
+    def reset_parameters(self) -> None:
+        if self.elementwise_affine:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
+
+    def forward(self, x):
+        assert len(x.shape) > len(
+            self.normalized_shape
+        ), "Input tensor dim must greater than normalized dim!"
+        self.begin_norm_axis = len(x.shape) - len(self.normalized_shape)
+        self.begin_params_axis = len(x.shape) - len(self.normalized_shape)
+        if x.device == flow.device("cpu"):
+            reduce_axis = []
+            for dim in range(len(x.shape)):
+                if dim >= self.begin_norm_axis:
+                    reduce_axis.append(dim)
+            mean = x.mean(dim=reduce_axis, keepdim=True)
+            variance = x.var(dim=reduce_axis, keepdim=True)
+            axis = self.begin_norm_axis
+            params_shape = x.shape[self.begin_params_axis :]
+            weight = self.weight
+            bias = self.bias
+            if len(mean.shape) == 1:
+                nd_params_shape = [1] * len(x.shape)
+                nd_params_shape[self.begin_norm_axis] = params_shape[0]
+                mean = mean.reshape(shape=nd_params_shape)
+                variance = variance.reshape(shape=nd_params_shape)
+                if self.weight and params_shape[0] == self.weight.nelement():
+                    weight = self.weight.reshape(shape=nd_params_shape)
+                if self.bias and params_shape[0] == self.bias.nelement():
+                    bias = self.bias.reshape(shape=nd_params_shape)
+            elif len(mean.shape) == len(x.shape):
+                pass
+            else:
+                raise ValueError(
+                    "shape of mean and variance should be 1D or has number of axes and x's"
+                )
+            variance += self.epsilon
+            normalized = (x - mean) * variance.rsqrt()
+            if self.weight:
+                normalized = normalized * weight
+            if self.bias:
+                normalized = normalized + bias
+            affined = normalized
+            nd_params_shape = [1] * (len(x.shape) - len(params_shape)) + list(
+                params_shape
+            )
+            if self.elementwise_affine:
+                affined = affined * self.weight
+                affined = affined + self.bias
+            return affined
+        else:
+            if self.elementwise_affine:
+                res = flow.F.layer_norm_affine(
+                    x,
+                    self.weight,
+                    self.bias,
+                    begin_norm_axis=self.begin_norm_axis,
+                    begin_params_axis=self.begin_params_axis,
+                    epsilon=self.epsilon,
+                )
+            else:
+                res = flow.F.layer_norm(
+                    x,
+                    begin_norm_axis=self.begin_norm_axis,
+                    begin_params_axis=self.begin_params_axis,
+                    epsilon=self.epsilon,
+                )
+            return res
+
+    def extra_repr(self) -> str:
+        return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
+            **self.__dict__
+        )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/padding.py b/python/oneflow/compatible/single_client/nn/modules/padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0f7bc841d4a3e56047a24011d9669deb72d2832
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/padding.py
@@ -0,0 +1,183 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class ReplicationPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ReplicationPad2d.html?highlight=replicationpad2d#torch.nn.ReplicationPad2d
+    
+    Pads the input tensor using the replication of the input boundary.
+
+    Args:
+        padding (Union[int, tuple, list]):  the size of the padding. If is `int`, uses the same padding in all boundaries. If a 4-`tuple`, uses (:math:`\\mathrm{padding_{left}}`, :math:`\\mathrm{padding_{right}}`, :math:`\\mathrm{padding_{top}}`, :math:`\\mathrm{padding_{bottom}}`)
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+            :math:`H_{out} = H_{in} + \\mathrm{padding_{top}} + \\mathrm{padding_{bottom}}`
+
+            :math:`W_{out} = W_{in} + \\mathrm{padding_{left}} + \\mathrm{padding_{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> replicationpad_layer_0 = flow.nn.ReplicationPad2d((2, 2, 1, 1))
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32))
+        >>> input_int = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.int32))
+        >>> output = replicationpad_layer_0(input)
+        >>> output.shape
+        flow.Size([1, 2, 5, 7])
+        >>> output
+        tensor([[[[ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 3.,  3.,  3.,  4.,  5.,  5.,  5.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.]],
+        <BLANKLINE>
+                 [[ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [12., 12., 12., 13., 14., 14., 14.],
+                  [15., 15., 15., 16., 17., 17., 17.],
+                  [15., 15., 15., 16., 17., 17., 17.]]]], dtype=oneflow.float32)
+        >>> output_int = replicationpad_layer_0(input_int)
+        >>> output_int
+        tensor([[[[ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 3.,  3.,  3.,  4.,  5.,  5.,  5.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.]],
+        <BLANKLINE>
+                 [[ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [12., 12., 12., 13., 14., 14., 14.],
+                  [15., 15., 15., 16., 17., 17., 17.],
+                  [15., 15., 15., 16., 17., 17., 17.]]]], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, padding: Union[int, tuple, list]):
+        super().__init__()
+        if isinstance(padding, (tuple, list)):
+            assert len(padding) == 4, ValueError("Length of padding must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be int or list or tuple!")
+        self.padding = boundary
+
+    def forward(self, x):
+        (_, _, h, w) = x.shape
+        if (
+            self.padding[2] < h
+            and self.padding[3] < h
+            and (self.padding[0] < w)
+            and (self.padding[1] < w)
+        ):
+            return flow.F.pad(x, pad=self.padding, mode="replicate")
+        else:
+            raise AssertionError(
+                "Padding size should be less than the corresponding input dimension. Please check."
+            )
+
+
+class ReflectionPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ReflectionPad2d.html
+
+
+    This operator pads the input tensor using the reflection of the input boundary.
+
+    Args:
+        padding (Union[int,tuple]): The size or bundary of padding, if is `int` uses the same padding in all dimension; if 4-dims `tuple`, uses :math:`(\\text{padding}_{\\text{left}}, \\text{padding}_{\\text{right}}, \\text{padding}_{\\text{top}}, \\text{padding}_{\\text{bottom}} )`
+
+    Returns:
+        Tensor: Returns a new tensor which is result of the reflection padding of the input tensor.
+
+    Shape:
+        - Input: :math:`(N, C, H_{\\text{in}}, W_{\\text{in}})`
+        - Output: :math:`(N, C, H_{\\text{out}}, W_{\\text{out}})` where
+
+          :math:`H_{\\text{out}} = H_{\\text{in}} + \\text{padding}_{\\text{top}} + \\text{padding}_{\\text{bottom}}`
+
+          :math:`W_{\\text{out}} = W_{\\text{in}} + \\text{padding}_{\\text{left}} + \\text{padding}_{\\text{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)), dtype=flow.float32)
+        >>> m = flow.nn.ReflectionPad2d((2, 2, 1, 1))
+        >>> out = m(input)
+        >>> out
+        tensor([[[[ 5.,  4.,  3.,  4.,  5.,  4.,  3.],
+                  [ 2.,  1.,  0.,  1.,  2.,  1.,  0.],
+                  [ 5.,  4.,  3.,  4.,  5.,  4.,  3.],
+                  [ 8.,  7.,  6.,  7.,  8.,  7.,  6.],
+                  [ 5.,  4.,  3.,  4.,  5.,  4.,  3.]],
+        <BLANKLINE>         
+                 [[14., 13., 12., 13., 14., 13., 12.],
+                  [11., 10.,  9., 10., 11., 10.,  9.],
+                  [14., 13., 12., 13., 14., 13., 12.],
+                  [17., 16., 15., 16., 17., 16., 15.],
+                  [14., 13., 12., 13., 14., 13., 12.]]]], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, padding: Union[int, tuple]) -> None:
+        super().__init__()
+        if isinstance(padding, tuple):
+            assert len(padding) == 4, ValueError("Padding length must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be in or list or tuple!")
+        self.padding = boundary
+
+    def forward(self, x):
+        (H, W) = (x.shape[2], x.shape[3])
+        if (
+            self.padding[2] < H
+            and self.padding[3] < H
+            and (self.padding[0] < W)
+            and (self.padding[1] < W)
+        ):
+            return flow.F.pad(x, pad=self.padding, mode="reflect")
+        else:
+            raise ValueError(
+                "padding size should be less than the corresponding input dimension!"
+            )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/permute.py b/python/oneflow/compatible/single_client/nn/modules/permute.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b55a7413551811a635655e198b33badc2b6ec8e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/permute.py
@@ -0,0 +1,68 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Permute(Module):
+    def __init__(self, *dims) -> None:
+        super().__init__()
+        self.perm = list(*dims)
+
+    def forward(self, x):
+        assert len(self.perm) == len(x.shape)
+        new_perm = []
+        for dim in self.perm:
+            if dim < 0:
+                dim += len(self.perm)
+            assert dim >= 0 and dim < len(
+                x.shape
+            ), "Invalid dim0 {}, len(shape): {}".format(dim, len(x.shape))
+            new_perm.append(dim)
+        return flow.F.transpose(x, perm=new_perm)
+
+
+@register_tensor_op("permute")
+def permute_op(tensor, *dims):
+    """Returns a view of the original tensor with its dimensions permuted.
+
+    Args:
+        *dims (int...): The desired ordering of dimensions
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> out = input.permute(1, 0, 2, 3).shape
+        >>> out
+        flow.Size([6, 2, 5, 3])
+
+    """
+    return Permute(dims)(tensor)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/pixelshuffle.py b/python/oneflow/compatible/single_client/nn/modules/pixelshuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8e3f5c619813825de9ec2a2774cbeca04e783a5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/pixelshuffle.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class PixelShuffle(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html#torch.nn.PixelShuffle
+
+    Rearranges elements in a tensor of shape :math:`(*, C \\times r^2, H, W)`
+    to a tensor of shape :math:`(*, C, H \\times r, W \\times r)`, where r is an upscale factor.
+
+    This is useful for implementing efficient sub-pixel convolution
+    with a stride of :math:`1/r`.
+
+    See the paper:
+    `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_
+    by Shi et. al (2016) for more details.
+
+    Args:
+        upscale_factor (int): factor to increase spatial resolution by
+
+    Shape:
+        - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions
+        - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where
+
+    .. math::
+        C_{out} = C_{in} \\div \\text{upscale_factor}^2
+
+    .. math::
+        H_{out} = H_{in} \\times \\text{upscale_factor}
+
+    .. math::
+        W_{out} = W_{in} \\times \\text{upscale_factor}
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> m = flow.nn.PixelShuffle(upscale_factor=2)
+        >>> x = flow.Tensor(np.random.randn(3, 4, 5, 5))
+        >>> y = m(x)
+        >>> print(y.size())
+        flow.Size([3, 1, 10, 10])
+
+        >>> m = flow.nn.PixelShuffle(upscale_factor=3)
+        >>> x = flow.Tensor(np.random.randn(1, 18, 2, 2))
+        >>> y = m(x)
+        >>> print(y.size())
+        flow.Size([1, 2, 6, 6])
+
+    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
+        https://arxiv.org/abs/1609.05158
+    """
+
+    def __init__(self, upscale_factor: int) -> None:
+        super().__init__()
+        assert upscale_factor > 0, "The scale factor must larger than zero"
+        self.upscale_factor = upscale_factor
+
+    def forward(self, input: Tensor) -> Tensor:
+        assert len(input.shape) == 4, "Only Accept 4D Tensor"
+        (_batch, _channel, _height, _width) = input.shape
+        assert (
+            _channel % self.upscale_factor ** 2 == 0
+        ), "The channels of input tensor must be divisible by (upscale_factor * upscale_factor)"
+        _new_c = int(_channel / self.upscale_factor ** 2)
+        out = input.reshape([_batch, _new_c, self.upscale_factor ** 2, _height, _width])
+        out = out.reshape(
+            [_batch, _new_c, self.upscale_factor, self.upscale_factor, _height, _width]
+        )
+        out = out.permute(0, 1, 4, 2, 5, 3)
+        out = out.reshape(
+            [
+                _batch,
+                _new_c,
+                _height * self.upscale_factor,
+                _width * self.upscale_factor,
+            ]
+        )
+        return out
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/pooling.py b/python/oneflow/compatible/single_client/nn/modules/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..72fcee1493f195ec3bae74593e478750b54bd8d2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/pooling.py
@@ -0,0 +1,563 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.common_types import (
+    _size_1_t,
+    _size_2_t,
+    _size_3_t,
+)
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import _pair, _single, _triple
+from oneflow.compatible.single_client.ops.nn_ops import (
+    calc_pool_padding,
+    get_dhw_offset,
+)
+
+
+class AvgPool1d(Module):
+    """Applies a 1D average pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and `kernel_size` :math:`k`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, C_j, l)  = \\frac{1}{k} \\sum_{m=0}^{k-1}
+                               input(N_i, C_j, stride[0] \\times h + m, stride*l + m)
+
+    If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points.
+    The parameters kernel_size, stride, padding can each be an int or a one-element tuple.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding or the
+        input. Sliding windows that would start in the right padded region are ignored.
+    
+    Args:
+        kernel_size: the size of the window.
+        strides: the stride of the window. Default value is kernel_size.
+        padding: implicit zero padding to be added on both sides.
+        ceil_mode: when True, will use ceil instead of floor to compute the output shape.
+        count_include_pad: when True, will include the zero-padding in the averaging calculation.
+
+
+    # TODO: fix cuDNN bugs in pooling_1d
+    
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_1_t,
+        stride: Optional[_size_1_t] = None,
+        padding: _size_1_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: Optional[bool] = None,
+        name: Optional[str] = None,
+    ):
+        raise NotImplementedError
+
+
+class AvgPool2d(Module):
+    """Performs the 2d-average pooling on the input.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and `kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, C_j, h, w)  = \\frac{1}{kH * kW} \\sum_{m=0}^{kH-1} \\sum_{n=0}^{kW-1}
+                               input(N_i, C_j, stride[0] \\times h + m, stride[1] \\times w + n)
+
+    Args:
+        kernel_size (Union[int, Tuple[int, int]]):  An int or list of ints that has length 1, 2. The size of the window for each dimension of the input Tensor.
+        strides (Union[int, Tuple[int, int]]): An int or list of ints that has length 1, 2. The stride of the sliding window for each dimension of the input Tensor.
+        padding (Tuple[int, int]): An int or list of ints that has length 1, 2. Implicit zero padding to be added on both sides.
+        ceil_mode (bool, default to False): When True, will use ceil instead of floor to compute the output shape.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client.experimental as flow
+        import numpy as np
+
+
+        of_avgpool2d = flow.nn.AvgPool2d(
+            kernel_size=(3, 2),
+            padding=0,
+            stride=(2, 1),
+        )
+        x = flow.Tensor(shape=(1, 1, 10, 10))
+        of_y = of_avgpool2d(x)   
+        
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        stride: Optional[_size_2_t] = None,
+        padding: _size_2_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: Optional[bool] = None,
+        divisor_override: Optional[int] = None,
+        name: Optional[str] = None,
+    ):
+        super().__init__()
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride) if stride is not None else kernel_size
+        assert isinstance(padding, int) or isinstance(
+            padding, tuple
+        ), "padding can only int int or tuple of 2 ints."
+        padding = _pair(padding)
+        padding = [0, 0, *padding]
+        assert count_include_pad is None, "count_include_pad not supported yet"
+        assert divisor_override is None, "divisor_override not supported yet"
+        self._channel_pos = "channels_first"
+        (self._padding_type, _pads_list) = calc_pool_padding(
+            padding, get_dhw_offset(self._channel_pos), 2
+        )
+        self._padding_before = [pad[0] for pad in _pads_list]
+        self._padding_after = [pad[1] for pad in _pads_list]
+        self.ceil_mode = ceil_mode
+
+    def forward(self, x):
+        res = flow.F.avg_pool_2d(
+            x,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            padding=self._padding_type,
+            padding_before=self._padding_before,
+            padding_after=self._padding_after,
+            ceil_mode=self.ceil_mode,
+            data_format=self._channel_pos,
+        )
+        return res
+
+
+class AvgPool3d(Module):
+    """Applies a 3D average pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
+    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and `kernel_size` :math:`(kD, kH, kW)`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, C_j, d, h, w)  = \\frac{1}{kD * kH * kW } \\sum_{k=0}^{kD-1} \\sum_{m=0}^{kH-1} \\sum_{n=0}^{kW-1}
+                               input(N_i, C_j, stride[0] \\times d + k, stride[1] \\times h + m, stride[2] \\times w + n)
+    
+    If padding is non-zero, then the input is implicitly zero-padded on all three sides for padding number of points.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding or the
+        input. Sliding windows that would start in the right padded region are ignored.
+
+    Args:
+        kernel_size: the size of the window.
+        strides:  the stride of the window. Default value is kernel_size.
+        padding:  implicit zero padding to be added on all three sides.
+        ceil_mode:  when True, will use ceil instead of floor to compute the output shape.
+        count_include_pad: when True, will include the zero-padding in the averaging calculation.
+        divisor_override: if specified, it will be used as divisor, otherwise kernel_size will be used.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = \\left\\lfloor\\frac{D_{in} + 2 \\times \\text{padding}[0] - \\text{kernel_size}[0]}{\\text{stride}[0]} + 1\\right\\rfloor
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in} + 2 \\times \\text{padding}[1] - \\text{kernel_size}[1]}{\\text{stride}[1]} + 1\\right\\rfloor
+
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in} + 2 \\times \\text{padding}[2] - \\text{kernel_size}[2]}{\\text{stride}[2]} + 1\\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+
+        >>> flow.enable_eager_execution()
+        >>> inputarr = np.random.randn(9, 7, 11, 32, 20)
+        >>> of_avgpool3d = flow.nn.AvgPool3d(kernel_size=(2,2,2),padding=(0,0,0),stride=(1,1,1),)
+        >>> x = flow.Tensor(inputarr)
+        >>> y = of_avgpool3d(x)
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        stride: Optional[_size_3_t] = None,
+        padding: _size_3_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: Optional[bool] = None,
+        divisor_override: Optional[int] = None,
+    ):
+        super().__init__()
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride) if stride is not None else kernel_size
+        assert padding == (0, 0, 0), "padding>0 not supported yet"
+        assert isinstance(padding, int) or isinstance(
+            padding, tuple
+        ), "padding can only int int or tuple of 3 ints."
+        padding = _pair(padding)
+        padding = [0, 0, *padding]
+        assert count_include_pad is None, "count_include_pad not supported yet"
+        assert divisor_override is None, "divisor_override not supported yet"
+        _channel_pos = "channels_first"
+        (_padding_type, _pads_list) = calc_pool_padding(
+            padding, get_dhw_offset(_channel_pos), 3
+        )
+        _padding_before = [pad[0] for pad in _pads_list]
+        _padding_after = [pad[1] for pad in _pads_list]
+        self._op = (
+            flow.builtin_op("avg_pool_3d")
+            .Attr("data_format", _channel_pos)
+            .Attr("pool_size", kernel_size)
+            .Attr("strides", stride)
+            .Attr("ceil_mode", ceil_mode)
+            .Attr("padding", _padding_type)
+            .Attr("padding_before", _padding_before)
+            .Attr("padding_after", _padding_after)
+            .Input("x")
+            .Output("y")
+            .Build()
+        )
+
+    def forward(self, x):
+        res = self._op(x)[0]
+        return res
+
+
+class MaxPool1d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.MaxPool1d.html#torch.nn.MaxPool1d
+
+    Applies a 1D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, L)`
+    and output :math:`(N, C, L_{out})` can be precisely described as:
+
+    .. math::
+        out(N_i, C_j, k) = \\max_{m=0, \\ldots, \\text{kernel\\_size} - 1}
+                input(N_i, C_j, stride \\times k + m)
+
+    If :attr:`padding` is non-zero, then the input is implicitly padded with minimum value on both sides
+    for :attr:`padding` number of points. :attr:`dilation` is the stride between the elements within the
+    sliding window. This `link`_ has a nice visualization of the pooling parameters.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    Args:
+        kernel_size: The size of the sliding window, must be > 0.
+        stride: The stride of the sliding window, must be > 0. Default value is :attr:`kernel_size`.
+        padding: Implicit negative infinity padding to be added on both sides, must be >= 0 and <= kernel_size / 2.
+        dilation: The stride between elements within a sliding window, must be > 0.
+        return_indices: If ``True``, will return the argmax along with the max values.
+                        Useful for :class:`torch.nn.MaxUnpool1d` later
+        ceil_mode: If ``True``, will use `ceil` instead of `floor` to compute the output shape. This
+                   ensures that every element in the input tensor is covered by a sliding window.
+
+    Shape:
+        - Input: :math:`(N, C, L_{in})`
+        - Output: :math:`(N, C, L_{out})`, where
+
+          .. math::
+              L_{out} = \\left\\lfloor \\frac{L_{in} + 2 \\times \\text{padding} - \\text{dilation}
+                    \\times (\\text{kernel_size} - 1) - 1}{\\text{stride}} + 1\\right\\rfloor
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_1_t,
+        stride: Optional[_size_1_t] = None,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ):
+        raise NotImplementedError
+
+
+class MaxPool2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
+
+    Applies a 2D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \\begin{aligned}
+            out(N_i, C_j, h, w) ={} & \\max_{m=0, \\ldots, kH-1} \\max_{n=0, \\ldots, kW-1} \\\\
+                                    & \\text{input}(N_i, C_j, \\text{stride[0]} \\times h + m,
+                                                   \\text{stride[1]} \\times w + n)
+        \\end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly minimum value padded on both sides
+    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
+    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+        - a single ``int`` -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: implicit minimum value padding to be added on both sides
+        dilation: a parameter that controls the stride of elements in the window
+        return_indices: if ``True``, will return the max indices along with the outputs.
+                        Useful for :class:`torch.nn.MaxUnpool2d` later
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})`, where
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in} + 2 * \\text{padding[0]} - \\text{dilation[0]}
+                    \\times (\\text{kernel_size[0]} - 1) - 1}{\\text{stride[0]}} + 1\\right\\rfloor
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in} + 2 * \\text{padding[1]} - \\text{dilation[1]}
+                    \\times (\\text{kernel_size[1]} - 1) - 1}{\\text{stride[1]}} + 1\\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> kernel_size, stride, padding = (3, 3), (1, 1), (1, 2)
+        >>> m = flow.nn.MaxPool2d(kernel_size, stride, padding)
+        >>> np.random.seed(0)
+        >>> x = flow.Tensor(np.random.rand(1, 1, 5, 3))
+        >>> y = m(x)
+        >>> y #doctest: +ELLIPSIS
+        tensor([[[[0.5488, 0.7152, 0.7152, 0.7152, 0.6459],
+                  ...
+                  [0.568 , 0.9256, 0.9256, 0.9256, 0.5289]]]], dtype=oneflow.float32)
+
+        >>> kernel_size, stride, padding = (2, 3), (4, 5), (1, 2)
+        >>> m = flow.nn.MaxPool2d(kernel_size, stride, padding)
+        >>> x = flow.Tensor(np.random.randn(9, 7, 32, 20))
+        >>> y = m(x)
+        >>> y.size()
+        flow.Size([9, 7, 9, 5])
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        stride: Optional[_size_2_t] = None,
+        padding: _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ):
+        super().__init__()
+        self.kernel_size = _pair(kernel_size)
+        self.strides = _pair(stride) if stride is not None else kernel_size
+        data_format = "NCHW"
+        self.channel_pos = (
+            "channels_last" if data_format == "NHWC" else "channels_first"
+        )
+        assert return_indices is False, "Only support return_indices==False for now!"
+        assert dilation == 1 or dilation == (1, 1), "Only support dilation==1 for now!"
+        padding = _pair(padding)
+        if len(padding) == 2:
+            if data_format == "NCHW":
+                padding = (0, 0, padding[0], padding[1])
+            else:
+                raise ValueError("error padding param!")
+        else:
+            raise ValueError("error padding param!")
+        (self.padding_type, pads_list) = calc_pool_padding(
+            padding, get_dhw_offset(self.channel_pos), 2
+        )
+        self.padding_before = [pad[0] for pad in pads_list]
+        self.padding_after = [pad[1] for pad in pads_list]
+        self.ceil_mode = ceil_mode
+
+    def forward(self, x):
+        return flow.F.max_pool_2d(
+            x,
+            kernel_size=self.kernel_size,
+            stride=self.strides,
+            padding=self.padding_type,
+            padding_before=self.padding_before,
+            padding_after=self.padding_after,
+            ceil_mode=self.ceil_mode,
+            data_format=self.channel_pos,
+        )
+
+
+class MaxPool3d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.MaxPool3d.html#torch.nn.MaxPool3d
+
+    Applies a 3D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
+    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \\begin{aligned}
+            \\text{out}(N_i, C_j, d, h, w) ={} & \\max_{k=0, \\ldots, kD-1} \\max_{m=0, \\ldots, kH-1} \\max_{n=0, \\ldots, kW-1} \\\\
+                                              & \\text{input}(N_i, C_j, \\text{stride[0]} \\times d + k,
+                                                             \\text{stride[1]} \\times h + m, \\text{stride[2]} \\times w + n)
+        \\end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly minimum value on both sides
+    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
+    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
+        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
+          the second `int` for the height dimension and the third `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: implicit minimum value padding to be added on all three sides
+        dilation: a parameter that controls the stride of elements in the window
+        return_indices: if ``True``, will return the max indices along with the outputs.
+                        Useful for :class:`torch.nn.MaxUnpool3d` later
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = \\left\\lfloor\\frac{D_{in} + 2 \\times \\text{padding}[0] - \\text{dilation}[0] \\times
+                (\\text{kernel_size}[0] - 1) - 1}{\\text{stride}[0]} + 1\\right\\rfloor
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in} + 2 \\times \\text{padding}[1] - \\text{dilation}[1] \\times
+                (\\text{kernel_size}[1] - 1) - 1}{\\text{stride}[1]} + 1\\right\\rfloor
+
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in} + 2 \\times \\text{padding}[2] - \\text{dilation}[2] \\times
+                (\\text{kernel_size}[2] - 1) - 1}{\\text{stride}[2]} + 1\\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> kernel_size, stride, padding = (3, 3, 3), (1, 1, 1), (1, 1, 2)
+        >>> m = flow.nn.MaxPool3d(kernel_size, stride, padding)
+        >>> np.random.seed(0)
+        >>> x = flow.Tensor(np.random.rand(1, 1, 3, 5, 3))
+        >>> y = m(x)
+        >>> y #doctest: +ELLIPSIS
+        tensor([[[[[0.7782, 0.87  , 0.9786, 0.9786, 0.9786],
+                   ...
+                   [0.9447, 0.9447, 0.9447, 0.6668, 0.6668]]]]], dtype=oneflow.float32)
+        >>> kernel_size, stride, padding = (2, 2, 3), (3, 4, 5), (2, 1, 2)
+        >>> m = flow.nn.MaxPool3d(kernel_size, stride, padding)
+        >>> x = flow.Tensor(np.random.randn(9, 7, 11, 32, 20))
+        >>> y = m(x)
+        >>> y.size()
+        flow.Size([9, 7, 5, 9, 5])
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        stride: Optional[_size_3_t] = None,
+        padding: _size_3_t = 0,
+        dilation: _size_3_t = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ):
+        super().__init__()
+        kernel_size = _triple(kernel_size)
+        strides = _triple(stride) if stride is not None else kernel_size
+        data_format = "NCDHW"
+        channel_pos = "channels_last" if data_format == "NDHWC" else "channels_first"
+        assert return_indices is False, "Only support return_indices==False for now!"
+        assert dilation == 1 or dilation == (
+            1,
+            1,
+            1,
+        ), "Only support dilation==1 for now!"
+        padding = _triple(padding)
+        if len(padding) == 3:
+            if data_format == "NCDHW":
+                padding = (0, 0, padding[0], padding[1], padding[2])
+            else:
+                raise ValueError("error padding param!")
+        else:
+            raise ValueError("error padding param!")
+        (padding_type, pads_list) = calc_pool_padding(
+            padding, get_dhw_offset(channel_pos), 3
+        )
+        padding_before = [pad[0] for pad in pads_list]
+        padding_after = [pad[1] for pad in pads_list]
+        self._op = (
+            flow.builtin_op("max_pool_3d")
+            .Attr("data_format", channel_pos)
+            .Attr("pool_size", kernel_size)
+            .Attr("strides", strides)
+            .Attr("ceil_mode", ceil_mode)
+            .Attr("padding", padding_type)
+            .Attr("padding_before", padding_before)
+            .Attr("padding_after", padding_after)
+            .Input("x")
+            .Output("y")
+            .Build()
+        )
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/reduce_ops.py b/python/oneflow/compatible/single_client/nn/modules/reduce_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..11a3d3e49b899b8508f70eded9eaefa3ab808812
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/reduce_ops.py
@@ -0,0 +1,201 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.nn.modules.utils import _check_axis
+
+
+def _build_reduce_op(op_type_name, keepdims):
+    return (
+        flow.builtin_op(op_type_name)
+        .Input("input_tensor")
+        .Output("output_tensor")
+        .Attr("keepdims", keepdims)
+        .Build()
+    )
+
+
+class Sum(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+        self._op = _build_reduce_op("reduce_sum", keepdims)
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        return self._op(input, axis=axis_checked)[0]
+
+
+@register_tensor_op("sum")
+def _sum(input, dim=None, keepdim=False):
+    """Computes the sum of row of elements in a tensor in the given axis, if the axis is None, sum of all elements will be caculated.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor([[1, 2, 3], [4, 5, 6]])
+        >>> flow.sum(input)
+        tensor([21.], dtype=oneflow.float32)
+        >>> flow.sum(input, dim=0)
+        tensor([5., 7., 9.], dtype=oneflow.float32)
+        >>> flow.sum(input, dim=1)
+        tensor([ 6., 15.], dtype=oneflow.float32)
+
+    """
+    return Sum(dim, keepdim)(input)
+
+
+class Mean(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+        if axis is None:
+            self.axes = []
+        else:
+            self.axes = list(axis) if isinstance(axis, collections.Sized) else [axis]
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        reduce_sum = flow.experimental.sum(input, dim=self.axis, keepdim=self.keepdims)
+        reduce_count = 1
+        if len(self.axes) == 0:
+            for dim in input.shape:
+                reduce_count *= dim
+        else:
+            for i in self.axes:
+                reduce_count *= input.shape[i]
+        return flow.experimental.mul(reduce_sum, 1.0 / reduce_count)
+
+
+@register_tensor_op("mean")
+def _mean(input, dim=None, keepdim=False):
+    """Computes the mean of row of elements in a tensor in the given axis, if the axis is None, mean of all elements will be caculated.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor([[1, 2, 3], [4, 5, 6]])
+        >>> flow.mean(input)
+        tensor([3.5], dtype=oneflow.float32)
+        >>> flow.mean(input, dim=0)
+        tensor([2.5, 3.5, 4.5], dtype=oneflow.float32)
+        >>> flow.mean(input, dim=1)
+        tensor([2., 5.], dtype=oneflow.float32)
+
+    """
+    return Mean(dim, keepdim)(input)
+
+
+class Min(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+        self._op = _build_reduce_op("reduce_min", keepdims)
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        return self._op(input, axis=axis_checked)[0]
+
+
+@register_tensor_op("min")
+def _min(input, dim=None, keepdim=False):
+    """Computes the minimum value of all elements in the input tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor([[4, 1, 5], [2, 6, 3]])
+        >>> flow.min(input)
+        tensor([1.], dtype=oneflow.float32)
+        >>> flow.min(input, dim=0)
+        tensor([2., 1., 3.], dtype=oneflow.float32)
+        >>> flow.min(input, dim=1)
+        tensor([1., 2.], dtype=oneflow.float32)
+
+    """
+    return Min(dim, keepdim)(input)
+
+
+class Max(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+        self._op = _build_reduce_op("reduce_max", keepdims)
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        return self._op(input, axis=axis_checked)[0]
+
+
+@register_tensor_op("max")
+def _max(input, dim=None, keepdim=False):
+    """Computes the maximum value of all elements in the input tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+        >>> input = flow.Tensor([[4, 1, 5], [2, 6, 3]])
+        >>> flow.max(input)
+        tensor([6.], dtype=oneflow.float32)
+        >>> flow.max(input, dim=0)
+        tensor([4., 6., 5.], dtype=oneflow.float32)
+        >>> flow.max(input, dim=1)
+        tensor([5., 6.], dtype=oneflow.float32)
+
+    """
+    return Max(dim, keepdim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/repeat.py b/python/oneflow/compatible/single_client/nn/modules/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..33b7a02a78bb59501dbf1bc009ef9cc11277d553
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/repeat.py
@@ -0,0 +1,96 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Repeat(Module):
+    def __init__(self, sizes) -> None:
+        super().__init__()
+        self.sizes = sizes
+
+    def forward(self, input):
+        repeat = self.sizes
+        for repeat_v in repeat:
+            assert repeat_v > 0
+        input_shape = input.shape
+        assert len(repeat) >= len(input_shape)
+        in_reshape = []
+        out_reshape = []
+        expand_dim = []
+        diff = len(repeat) - len(input_shape)
+        for i in range(len(repeat) - 1, -1, -1):
+            if i >= diff:
+                if repeat[i] > 1:
+                    if input_shape[i - diff] > 1:
+                        in_reshape.insert(0, input_shape[i - diff])
+                        in_reshape.insert(0, 1)
+                        expand_dim.insert(0, input_shape[i - diff])
+                        expand_dim.insert(0, repeat[i])
+                        out_reshape.insert(0, input_shape[i - diff] * repeat[i])
+                    else:
+                        in_reshape.insert(0, input_shape[i - diff])
+                        expand_dim.insert(0, repeat[i])
+                        out_reshape.insert(0, repeat[i])
+                else:
+                    in_reshape.insert(0, input_shape[i - diff])
+                    expand_dim.insert(0, input_shape[i - diff])
+                    out_reshape.insert(0, input_shape[i - diff])
+            else:
+                expand_dim.insert(0, repeat[i])
+                out_reshape.insert(0, repeat[i])
+        new_tensor = flow.experimental.reshape(input, in_reshape)
+        tmp_tensor = new_tensor.expand(*expand_dim)
+        out = flow.experimental.reshape(tmp_tensor, out_reshape)
+        return out
+
+
+@register_tensor_op("repeat")
+def repeat_op(x, sizes):
+    """This operator repeat the input tensor to a larger size along the specified dimensions.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): The input Tensor.
+        size (Sequence[int]): The number of times to repeat this tensor along each dimension
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([[[[0, 1]],
+        ...               [[2, 3]],
+        ...               [[4, 5]]]]).astype(np.int32)
+
+        >>> input = flow.Tensor(x)
+        >>> out = input.repeat(sizes=(1, 1, 2, 2))
+        >>> out.shape
+        flow.Size([1, 3, 2, 4])
+    """
+    return Repeat(sizes=sizes)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/reshape.py b/python/oneflow/compatible/single_client/nn/modules/reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1e61f3b725075063ab751f6d94e049ea4af302a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/reshape.py
@@ -0,0 +1,121 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Reshape(Module):
+    def __init__(self, shape: Sequence[int]) -> None:
+        super().__init__()
+        self.shape = shape
+
+    def forward(self, x):
+        return flow.F.reshape(x, shape=self.shape)
+
+
+@register_tensor_op("reshape")
+def reshape_op(x, shape: Sequence[int] = None):
+    """This operator reshapes a Tensor.
+
+    We can set one dimension in `shape` as `-1`, the operator will infer the complete shape.
+
+    Args:
+        x: A Tensor.
+        shape: Shape of the output tensor.
+    Returns:
+        A Tensor has the same type as `x`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array(
+        ...    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+        ... ).astype(np.float32)
+        >>> input = flow.Tensor(x)
+
+        >>> y = flow.reshape(input, shape=[2, 2, 2, -1]).shape
+        >>> y
+        flow.Size([2, 2, 2, 2])
+
+    """
+    return Reshape(shape=shape)(x)
+
+
+@register_tensor_op("view")
+def view_op(x, shape: Sequence[int] = None):
+    """
+    The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.Tensor.view.html
+
+    Returns a new tensor with the same data as the :attr:`self` tensor but of a
+    different :attr:`shape`.
+
+    The returned tensor shares the same data and must have the same number
+    of elements, but may have a different size. For a tensor to be viewed, the new
+    view size must be compatible with its original size and stride, i.e., each new
+    view dimension must either be a subspace of an original dimension, or only span
+    across original dimensions :math:`d, d+1, \\dots, d+k` that satisfy the following
+    contiguity-like condition that :math:`\\forall i = d, \\dots, d+k-1`,
+
+    .. math::
+
+      \\text{stride}[i] = \\text{stride}[i+1] \\times \\text{size}[i+1]
+
+    Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape`
+    without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a
+    :meth:`view` can be performed, it is advisable to use :meth:`reshape`, which
+    returns a view if the shapes are compatible, and copies (equivalent to calling
+    :meth:`contiguous`) otherwise.
+
+    Args:
+        x: A Tensor.
+        shape: Shape of the output tensor.
+    Returns:
+        A Tensor has the same type as `x`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array(
+        ...    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+        ... ).astype(np.float32)
+        >>> input = flow.Tensor(x)
+
+        >>> y = flow.view(input, shape=[2, 2, 2, -1]).numpy().shape
+        >>> y
+        (2, 2, 2, 2)
+
+    """
+    return Reshape(shape=shape)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/round.py b/python/oneflow/compatible/single_client/nn/modules/round.py
new file mode 100644
index 0000000000000000000000000000000000000000..627ee4e693e78c7efc1ca20cc3835d4b22527634
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/round.py
@@ -0,0 +1,69 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Round(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.round(x)
+
+
+def round_op(x):
+    """This operator rounds the value of Blob to the nearest integer.
+    Args:
+        x (oneflow.compatible.single_client.Tensor): A Tensor
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> x1 = flow.Tensor(np.array([1.49999, 1.500001, 2.7]).astype(np.float32))
+        >>> out1 = flow.round(x1)
+        >>> out1.numpy()
+        array([1., 2., 3.], dtype=float32)
+        >>> x2 = flow.Tensor(np.array([2.499999, 7.5000001, 5.3, 6.8]).astype(np.float32))
+        >>> out2 = flow.round(x2)
+        >>> out2.numpy()
+        array([2., 8., 5., 7.], dtype=float32)
+
+    """
+    return Round()(x)
+
+
+@register_tensor_op("round")
+def round_op_tensor(x):
+    """
+    round() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.round`
+
+    """
+    return Round()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/sign.py b/python/oneflow/compatible/single_client/nn/modules/sign.py
new file mode 100644
index 0000000000000000000000000000000000000000..53d3c3d3ba020648e59df0a1d11ef874a0237724
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/sign.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Sign(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.sign(x)
+
+
+def sign_op(x):
+    """Computes the sign of Tensor.
+
+    .. math::
+
+        \\text{out}_{i}  = \\text{sgn}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> x1 = flow.Tensor(np.array([-2, 0, 2]).astype(np.float32))
+        >>> out1 = flow.sign(x1)
+        >>> out1.numpy()
+        array([-1.,  0.,  1.], dtype=float32)
+        >>> x2 = flow.Tensor(np.array([-3.2, -4.5, 5.8]).astype(np.float32),device=flow.device('cuda'))
+        >>> out2 = flow.sign(x2)
+        >>> out2.numpy()
+        array([-1., -1.,  1.], dtype=float32)
+
+    """
+    return Sign()(x)
+
+
+@register_tensor_op("sign")
+def sign_op_tensor(x):
+    """
+
+    sign() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.sign`
+
+    """
+    return Sign()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/sinh.py b/python/oneflow/compatible/single_client/nn/modules/sinh.py
new file mode 100644
index 0000000000000000000000000000000000000000..050ea575b5944e3be8964d217430fdc852fadee4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/sinh.py
@@ -0,0 +1,76 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Sinh(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.sinh(x)
+
+
+def sinh_op(x):
+    """Returns a new tensor with the hyperbolic sine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\sinh(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+
+        >>> x1 = flow.Tensor(np.array([1, 2, 3]))
+        >>> x2 = flow.Tensor(np.array([1.53123589,0.54242598,0.15117185]))
+        >>> x3 = flow.Tensor(np.array([1,0,-1]))
+
+        >>> flow.enable_eager_execution()
+        >>> flow.sinh(x1).numpy()
+        array([ 1.1752012,  3.6268604, 10.017875 ], dtype=float32)
+        >>> flow.sinh(x2).numpy()
+        array([2.20381  , 0.5694193, 0.1517483], dtype=float32)
+        >>> flow.sinh(x3).numpy()
+        array([ 1.1752012,  0.       , -1.1752012], dtype=float32)
+
+    """
+    return Sinh()(x)
+
+
+@register_tensor_op("sinh")
+def sinh_op_tensor(x):
+    """
+
+    sinh() -> Tensor
+
+    See :func:`oneflow.compatible.single_client.experimental.sinh`
+
+    """
+    return Sinh()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/slice.py b/python/oneflow/compatible/single_client/nn/modules/slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..f42cf75aa4b2896d8fd88356432a0be891d51aac
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/slice.py
@@ -0,0 +1,154 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence, Tuple
+
+import numpy as np
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.ops.array_ops import (
+    GetSliceAttrs,
+    check_slice_tup_list,
+)
+
+
+class Slice(Module):
+    def __init__(
+        self, start: Tuple[int, ...], stop: Tuple[int, ...], step: Tuple[int, ...]
+    ) -> None:
+        super().__init__()
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def forward(self, x):
+        return flow.F.slice(x, start=self.start, stop=self.stop, step=self.step)
+
+
+def slice_op(x, slice_tup_list: Sequence[Tuple[int, int, int]]):
+    """Extracts a slice from a tensor.
+    The `slice_tup_list` assigns the slice indices in each dimension, the format is (start, stop, step).
+    The operator will slice the tensor according to the `slice_tup_list`.
+
+    Args:
+        x: A `Tensor`.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.random.randn(3, 6, 9).astype(np.float32))
+        >>> tup_list = [[None, None, None], [0, 5, 2], [0, 6, 3]]
+        >>> y = flow.slice(input, slice_tup_list=tup_list)
+        >>> y.shape
+        flow.Size([3, 3, 2])
+    """
+    (start, stop, step) = check_slice_tup_list(slice_tup_list, x.shape)
+    return Slice(start, stop, step)(x)
+
+
+class SliceUpdate(Module):
+    def __init__(
+        self, start: Tuple[int, ...], stop: Tuple[int, ...], step: Tuple[int, ...]
+    ) -> None:
+        super().__init__()
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def forward(self, x, update):
+        return flow.F.slice_update(
+            x, update, start=self.start, stop=self.stop, step=self.step
+        )
+
+
+def slice_update_op(x, update, slice_tup_list: Sequence[Tuple[int, int, int]]):
+    """Update a slice of tensor `x`. Like `x[start:stop:step] = update`. 
+
+    Args:
+        x: A `Tensor`, whose slice will be updated.
+        update: A `Tensor`, indicate the update content.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.array([1, 1, 1, 1, 1]).astype(np.float32))
+        >>> update = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        >>> y = flow.slice_update(input, update, slice_tup_list=[[1, 4, 1]])
+        >>> y.numpy()
+        array([1., 2., 3., 4., 1.], dtype=float32)
+    """
+    (start, stop, step) = GetSliceAttrs(slice_tup_list, x.shape)
+    return SliceUpdate(start, stop, step)(x, update)
+
+
+class LogicalSliceAssign(Module):
+    def __init__(
+        self, start: Tuple[int, ...], stop: Tuple[int, ...], step: Tuple[int, ...]
+    ) -> None:
+        super().__init__()
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def forward(self, x, update):
+        if update.dtype != x.dtype:
+            update = update.to(dtype=x.dtype)
+        return flow.F.logical_slice_assign(
+            x, update, start=self.start, stop=self.stop, step=self.step
+        )
+
+
+def logical_slice_assign_op(x, update, slice_tup_list: Sequence[Tuple[int, int, int]]):
+    """Update a slice of tensor `x`(in-place). Like `x[start:stop:step] = update`. 
+
+    Args:
+        x: A `Tensor`, whose slice will be updated.
+        update: A `Tensor`, indicate the update content.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.array([1, 1, 1, 1, 1]).astype(np.float32))
+        >>> update = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        >>> y = flow.tmp.logical_slice_assign(input, update, slice_tup_list=[[1, 4, 1]])
+    """
+    "[summary]\n\n    Returns:\n        [type]: [description]\n    "
+    (start, stop, step) = GetSliceAttrs(slice_tup_list, x.shape)
+    return LogicalSliceAssign(start, stop, step)(x, update)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/softplus.py b/python/oneflow/compatible/single_client/nn/modules/softplus.py
new file mode 100644
index 0000000000000000000000000000000000000000..6383839cac0132f036fe742ce0ae02cfa71c8757
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/softplus.py
@@ -0,0 +1,71 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Softplus(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.softplus(x)
+
+
+@register_tensor_op("softplus")
+def softplus_op(x):
+    """Applies the element-wise function:
+
+    .. math::
+        Softplus(x)= \\frac{1}{尾}*log(1+exp(尾鈭梮))
+
+    SoftPlus is a smooth approximation to the ReLU function and can be used to constrain the output of a machine to always be positive.
+
+    For numerical stability the implementation reverts to the linear function when :attr:`input X 尾 > threshold`.
+
+    Args:
+        beta:the value for the Softplus formulation.Default:1
+
+        threshold:values above this revert to a linear function.Default:20
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+
+        >>> x1 = flow.Tensor(np.array([1, 2, 3]))
+        >>> x2 = flow.Tensor(np.array([1.53123589,0.54242598,0.15117185]))
+        >>> x3 = flow.Tensor(np.array([1,0,-1]))
+
+        >>> flow.enable_eager_execution()
+        >>> flow.softplus(x1).numpy()
+        array([1.3132616, 2.126928 , 3.0485873], dtype=float32)
+        >>> flow.softplus(x2).numpy()
+        array([1.7270232, 1.0006962, 0.771587 ], dtype=float32)
+        >>> flow.softplus(x3).numpy()
+        array([1.3132616 , 0.6931472 , 0.31326166], dtype=float32)
+
+    """
+    return Softplus()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/sort.py b/python/oneflow/compatible/single_client/nn/modules/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..c66a4dfae3e3f3c399184566bdba72e6d8fabb95
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/sort.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+from oneflow.compatible.single_client.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class Sort(Module):
+    def __init__(self, dim: int = -1, descending: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        direction = "DESCENDING" if descending else "ASCENDING"
+        self._argsort_op = (
+            flow.builtin_op("arg_sort")
+            .Input("in")
+            .Output("out")
+            .Attr("direction", direction)
+            .Build()
+        )
+
+    def forward(self, input):
+        num_dims = len(input.shape)
+        dim = self.dim if self.dim >= 0 else self.dim + num_dims
+        assert 0 <= dim < num_dims, "dim out of range"
+        if dim == num_dims - 1:
+            indices = self._argsort_op(input)[0]
+            return (flow.experimental.gather(input, indices, dim), indices)
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_dims, dim)
+            x = flow.F.transpose(input, perm=perm)
+            indices = self._argsort_op(x)[0]
+            indices = flow.F.transpose(indices, perm=get_inversed_perm(perm))
+            return (flow.experimental.gather(input, indices, dim), indices)
+
+
+@register_tensor_op("sort")
+def sort_op(input, dim: int = -1, descending: bool = False):
+    """Sorts the elements of the input tensor along a given dimension in ascending order by value.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): The input Tensor.
+        dim (int, optional): dimension to be sorted. Defaults to the last dim (-1).
+        descending (bool, optional): controls the sorting order (ascending or descending).
+
+    Returns:
+        Tuple(oneflow.compatible.single_client.Tensor, oneflow.compatible.single_client.Tensor(dtype=int32)): A tuple of (values, indices), where
+        where the values are the sorted values and the indices are the indices of the elements
+        in the original input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([[1, 3, 8, 7, 2], [1, 9, 4, 3, 2]], dtype=np.float32)
+        >>> input = flow.Tensor(x)
+        >>> (values, indices) = flow.sort(input)
+        >>> values
+        tensor([[1., 2., 3., 7., 8.],
+                [1., 2., 3., 4., 9.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[0, 4, 1, 3, 2],
+                [0, 4, 3, 2, 1]], dtype=oneflow.int32)
+        >>> (values, indices) = flow.sort(input, descending=True)
+        >>> values
+        tensor([[8., 7., 3., 2., 1.],
+                [9., 4., 3., 2., 1.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[2, 3, 1, 4, 0],
+                [1, 2, 3, 4, 0]], dtype=oneflow.int32)
+        >>> (values, indices) = flow.sort(input, dim=0)
+        >>> values
+        tensor([[1., 3., 4., 3., 2.],
+                [1., 9., 8., 7., 2.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[0, 0, 1, 1, 0],
+                [1, 1, 0, 0, 1]], dtype=oneflow.int32)
+ 
+    """
+    return Sort(dim=dim, descending=descending)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/sparse.py b/python/oneflow/compatible/single_client/nn/modules/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e7ac3286dbd95fb3eef79b3eb8899929a564d2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/sparse.py
@@ -0,0 +1,110 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import List, Optional, Tuple
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Embedding(Module):
+    """A simple lookup table that stores embeddings of a fixed dictionary and size.
+
+    This module is often used to store word embeddings and retrieve them using indices.
+    The input to the module is a list of indices, and the output is the corresponding
+    word embeddings.
+
+    Args:
+        num_embeddings (int): size of the dictionary of embeddings
+        embedding_dim (int): the size of each embedding vector
+        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
+                                    therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
+                                    i.e. it remains as a fixed "pad". For a newly constructed Embedding,
+                                    the embedding vector at :attr:`padding_idx` will default to all zeros,
+                                    but can be updated to another value to be used as the padding vector.
+    
+    For example:
+
+    .. code-block:: python
+        
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> indices = flow.Tensor([[1, 2, 4, 5], [4, 3, 2, 9]], dtype=flow.int)
+        >>> m = flow.nn.Embedding(10, 3)
+        >>> y = m(indices)
+
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: Optional[int] = None,
+        max_norm: Optional[float] = None,
+        norm_type: Optional[float] = None,
+        scale_grad_by_freq: bool = False,
+        sparse: bool = False,
+        _weight: Optional[Tensor] = None,
+    ):
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert (
+                    padding_idx < self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+            elif padding_idx < 0:
+                assert (
+                    padding_idx >= -self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+                padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        assert max_norm is None, "Not support max_norm yet!"
+        assert norm_type is None, "Not support norm_type yet!"
+        assert scale_grad_by_freq is False, "Not support scale_grad_by_freq=True yet!"
+        assert sparse is False, "Not support sparse=True yet!"
+        if _weight is None:
+            self.weight = flow.nn.Parameter(Tensor(num_embeddings, embedding_dim))
+            self.reset_parameters()
+        else:
+            assert list(_weight.shape) == [
+                num_embeddings,
+                embedding_dim,
+            ], "Shape of weight does not match num_embeddings and embedding_dim"
+            self.weight = flow.nn.Parameter(_weight)
+        self.sparse = sparse
+
+    def reset_parameters(self) -> None:
+        flow.nn.init.normal_(self.weight)
+        self._fill_padding_idx_with_zero()
+
+    def _fill_padding_idx_with_zero(self) -> None:
+        if self.padding_idx is not None:
+            with flow.no_grad():
+                self.weight[self.padding_idx].fill_(0)
+
+    def forward(self, indices):
+        res = flow.F.gather(self.weight, indices, axis=0)
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/squeeze.py b/python/oneflow/compatible/single_client/nn/modules/squeeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..c441759bb5c037e83166b573f2cde5aab1fb5b34
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/squeeze.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Squeeze(Module):
+    def __init__(self, dim: Optional[Sequence[int]] = None) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        if self.dim is None:
+            return x
+        return flow.F.squeeze(x, dim=self.dim)
+
+
+@register_tensor_op("squeeze")
+def squeeze_op(input, dim: Optional[Sequence[int]] = None):
+    """This operator removes the specified dimention which size is 1 of the input Tensor.
+    If the `dim` is not specified, this operator will remove all the dimention which size is 1 of the input Tensor.
+
+    The amount of element in return value is the same as Tensor `input`.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): The input Tensor.
+        dim (Optional[Sequence[int]]): The dim. Defaults to None.
+
+    Returns:
+        Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.array([[[[1, 1, 1]]]]).astype(np.int32))
+        >>> out = flow.squeeze(input, dim=[1, 2]).shape
+        >>> out
+        flow.Size([1, 3])
+
+    """
+    if isinstance(dim, int):
+        dim = [dim]
+    elif dim is None:
+        dim = range(input.ndim)
+    dim = list(filter(lambda i: input.size(i) == 1, dim))
+    return Squeeze(dim=dim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/stack.py b/python/oneflow/compatible/single_client/nn/modules/stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..19b8f2c910820d4d67912967829b159033484678
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/stack.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import List, Tuple
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor, register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Stack(Module):
+    def __init__(self, dim: int = 0) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, inputs):
+        assert isinstance(inputs, (List, Tuple))
+        input_shape = inputs[0].shape
+        max_dim = len(input_shape)
+        if self.dim < 0:
+            self.dim = self.dim + max_dim + 1
+        assert self.dim >= 0 and self.dim <= max_dim
+        input_list_length = len(inputs)
+        unsqueezed = list()
+        for i in range(input_list_length):
+            current_shape = inputs[i].shape
+            assert (
+                input_shape == current_shape
+            ), "Each tensor should have the same shape ! Found a tensor instance shape is: {}".format(
+                current_shape
+            )
+            unsqueezed.append(inputs[i].unsqueeze(dim=self.dim))
+        return flow.experimental.cat(unsqueezed, dim=self.dim)
+
+
+@register_tensor_op("stack")
+def stack(inputs: Tensor, dim: int = 0) -> None:
+    """Concatenates a sequence of tensors along a new dimension.
+    The returned tensor shares the same underlying data with input tensors.
+
+    A :attr:`dim` value within the range `[-input.ndimension() - 1, input.ndimension() + 1]`
+    can be used. Negative :attr:`dim` will correspond to :meth:`stack`
+    applied at :attr:`dim` = ``dim + input.ndimension() + 1``.
+
+    Args:
+        inputs (List[oneflow.compatible.single_client.Tensor]): the list of input tensors. Each tensor should have the same shape.
+        dim (int): the index at which to insert the concatenated dimension.
+
+    Returns:
+        A `Tensor`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> x = flow.Tensor(np.random.rand(1, 3, 5))
+        >>> y = flow.Tensor(np.random.rand(1, 3, 5))
+        >>> out = flow.stack([x, y], dim = -1)
+        >>> out.shape
+        flow.Size([1, 3, 5, 2])
+    """
+    return Stack(dim)(inputs)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/tan.py b/python/oneflow/compatible/single_client/nn/modules/tan.py
new file mode 100644
index 0000000000000000000000000000000000000000..810312613d3d13feca18d14a9768696a8e5a74ba
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/tan.py
@@ -0,0 +1,69 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Tan(Module):
+    def __init__(self):
+        super().__init__()
+        self._op = flow.builtin_op("tan").Input("x").Output("y").Build()
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+def tan_op(input):
+    """Returns  the tan value of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\tan(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> np_arr = np.array([-1/4*np.pi, 0, 1/4*np.pi]).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> output = flow.tan(input)
+        >>> output
+        tensor([-1.,  0.,  1.], dtype=oneflow.float32)
+
+    """
+    return Tan()(input)
+
+
+@register_tensor_op("tan")
+def tan_op_tensor(input):
+    """
+    tan() -> Tensor
+    See :func:`oneflow.compatible.single_client.experimental.tan`
+
+    """
+    return Tan()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/tensor_buffer.py b/python/oneflow/compatible/single_client/nn/modules/tensor_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e02bcaaa81e10bb9d0d45e65461d82b413ae393d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/tensor_buffer.py
@@ -0,0 +1,151 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class TensorBufferToTensor(Module):
+    def __init__(self, dtype, instance_shape):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("tensor_buffer_to_tensor")
+            .Input("in")
+            .Output("out")
+            .Attr("dtype", dtype)
+            .Attr("instance_shape", instance_shape)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+def tensor_buffer_to_tensor_op(x, dtype: flow.dtype, instance_shape: Sequence[int]):
+    """This operator converts the Tensor's type from TensorBuffer to original type.
+    Some operator's output data type is `TensorBuffer`, you can use this operator to convert back
+    to `Tensor`.
+
+    Refer to `Concept Explanation <https://docs.oneflow.org/basics_topics/concept_explanation.html#3tensorbuffer-tensorlist>`_
+    for more about TensorBuffer.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): The input Tensor.
+        dtype (flow.dtype): The data dtype.
+        instance_shape (Sequence[int]): The shape of each TensorBuffer instance.
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.random.randn(4, 16, 64, 64).astype(np.float32)
+        >>> x = flow.Tensor(x)
+        >>> x = flow.tensor_to_tensor_buffer(x, instance_dims=2)
+        >>> output = flow.tensor_buffer_to_tensor(x, instance_shape=(64, 64), dtype=flow.float)
+        >>> output.shape
+        flow.Size([4, 16, 64, 64])
+
+    """
+    return TensorBufferToTensor(dtype=dtype, instance_shape=instance_shape)(x)
+
+
+class TensorToTensorBuffer(Module):
+    def __init__(self, instance_dims):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("tensor_to_tensor_buffer")
+            .Input("in")
+            .Output("out")
+            .Attr("instance_dims", instance_dims)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+def tensor_to_tensor_buffer(x, instance_dims: int):
+    """This operator converts the Tensor's type to TensorBuffer.
+
+    Refer to `Concept Explanation <https://docs.oneflow.org/basics_topics/concept_explanation.html#3tensorbuffer-tensorlist>`_
+    for more about TensorBuffer.
+
+    Args:
+        x (oneflow.compatible.single_client.Tensor): The input Tensor.
+        instance_dims (int): The dimensions of dynamic tensor instance.
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.random.randn(4, 16, 64, 64).astype(np.float32)
+        >>> x = flow.Tensor(x)
+        >>> x = flow.tensor_to_tensor_buffer(x, instance_dims=2)
+        >>> output = flow.tensor_buffer_to_tensor(x, instance_shape=(64, 64), dtype=flow.float)
+        >>> output.shape
+        flow.Size([4, 16, 64, 64])
+    
+    """
+    return TensorToTensorBuffer(instance_dims=instance_dims)(x)
+
+
+class GenTensorBuffer(Module):
+    def __init__(self, shape, shape_list, value_list, data_type, dynamic_out):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("gen_tensor_buffer")
+            .Output("out")
+            .Attr("shape", shape)
+            .Attr("shape_list", shape_list)
+            .Attr("value_list", value_list)
+            .Attr("data_type", data_type)
+            .Attr("dynamic_out", dynamic_out)
+            .Build()
+        )
+
+    def forward(self):
+        return self._op()[0]
+
+
+def gen_tensor_buffer(
+    shape: Sequence[int],
+    shape_list: Sequence[Sequence[int]],
+    value_list: Sequence[float],
+    data_type: Optional[flow.dtype] = flow.float32,
+    dynamic_out: Optional[bool] = False,
+):
+    return GenTensorBuffer(shape, shape_list, value_list, data_type, dynamic_out)()
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/tensor_ops.py b/python/oneflow/compatible/single_client/nn/modules/tensor_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b40d0788e1e2db22df77acae83386bcb5d12b721
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/tensor_ops.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class TypeAs(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input, target):
+        return input.to(dtype=target.dtype)
+
+
+@register_tensor_op("type_as")
+def type_as_op(input, target):
+    """Returns this tensor cast to the type of the given tensor.
+        This is a no-op if the tensor is already of the correct type.
+
+    Args:
+        input  (Tensor): the input tensor.
+        target (Tensor): the tensor which has the desired type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.random.randn(1, 2, 3), dtype=flow.float32)
+        >>> target = flow.Tensor(np.random.randn(4, 5, 6), dtype = flow.int32)
+        >>> input = input.type_as(target)
+        >>> input.dtype
+        oneflow.int32
+
+    """
+    return TypeAs()(input, target)
+
+
+class Long(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input):
+        return input.to(dtype=flow.int64)
+
+
+@register_tensor_op("long")
+def long_op(input):
+    """`Tensor.long()` is equivalent to `Tensor.to(flow.int64)`. See to().
+
+    Args:
+        input  (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.random.randn(1, 2, 3), dtype=flow.float32)
+        >>> input = input.long()
+        >>> input.dtype
+        oneflow.int64
+
+    """
+    return Long()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/tile.py b/python/oneflow/compatible/single_client/nn/modules/tile.py
new file mode 100644
index 0000000000000000000000000000000000000000..70b78a37cb64a5354bc644af10e22272084e45cb
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/tile.py
@@ -0,0 +1,93 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor, register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Tile(Module):
+    def __init__(self, reps: tuple) -> None:
+        super().__init__()
+        self.reps = reps
+
+    def forward(self, input: Tensor) -> Tensor:
+        reps = self.reps
+        for s in self.reps:
+            assert s > 0
+        input_shape = input.shape
+        diff = len(input_shape) - len(reps)
+        if diff > 0:
+            shape = [1 for _ in range(diff)]
+            shape.extend([i for i in reps])
+            reps = tuple(shape)
+        return input.repeat(reps)
+
+
+@register_tensor_op("tile")
+def tile_op(x, reps):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.tile.html
+
+    Constructs a tensor by repeating the elements of ``input``.  The ``reps`` argument specifies the number
+    of repetitions in each dimension.
+
+    If ``reps`` specifies fewer dimensions than ``input`` has, then ones are prepended to ``reps`` until
+    all dimensions are specified.  For example, if ``input`` has shape (8, 6, 4, 2) and ``reps`` is (2, 2),
+    then ``reps`` is treated as (1, 1, 2, 2).
+
+    Analogously, if ``input`` has fewer dimensions than ``reps`` specifies, then ``input`` is treated as
+    if it were unsqueezed at dimension zero until it has as many dimensions as ``reps`` specifies.
+    For example, if ``input`` has shape (4, 2) and ``reps`` is (3, 3, 2, 2), then ``input`` is treated as
+    if it had the shape (1, 1, 4, 2).
+
+    .. note::
+        This function is similar to NumPy鈥檚 tile function.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): the tensor whose elements to repeat.
+        reps (tuple): the number of repetitions per dimension.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = np.array([1, 2]).astype(np.int32)
+        >>> input = flow.Tensor(x, dtype=flow.int32)
+        >>> out = input.tile(reps=(2,))
+        >>> out
+        tensor([1, 2, 1, 2], dtype=oneflow.int32)
+
+        >>> x = np.random.randn(5, 2, 1)
+        >>> input = flow.Tensor(x)
+        >>> out = input.tile(reps=(3, 4))
+        >>> out.size()
+        flow.Size([5, 6, 4])
+
+    """
+    return Tile(reps=reps)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/to.py b/python/oneflow/compatible/single_client/nn/modules/to.py
new file mode 100644
index 0000000000000000000000000000000000000000..2440f71cf43d8fe5a8496e61cd779c681a6cd563
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/to.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class To(Module):
+    def __init__(self, copy):
+        super().__init__()
+        self.copy = copy
+
+    def forward(self, x, device, dtype):
+        result = x
+        if device is not None:
+            if x.device != device or self.copy:
+                result = flow.F.copy(x, device_type=device.type, device_id=device.index)
+        if dtype is not None:
+            if x.dtype != dtype or self.copy:
+                result = flow.F.cast(result, dtype=dtype)
+        return result
+
+
+@register_tensor_op("to")
+def to_op(input, *args, **kwargs):
+    """Performs Tensor dtype and/or device conversion. 
+        A flow.dtype and flow.device are inferred from the arguments of `input.to(*args, **kwargs)`.
+    
+    .. note::
+        If the ``input`` Tensor already
+        has the correct :class:`flow.dtype` and :class:`flow.device`, then ``input`` is returned.
+        Otherwise, the returned tensor is a copy of ``input`` with the desired.
+
+    Args:
+        input (oneflow.compatible.single_client.Tensor): An input tensor.
+        *args (oneflow.compatible.single_client.Tensor or oneflow.compatible.single_client.device or oneflow.compatible.single_client.dtype): Positional arguments
+        **kwargs (oneflow.compatible.single_client.device or oneflow.compatible.single_client.dtype) : Key-value arguments
+
+    Returns:
+        oneflow.compatible.single_client.Tensor: A Tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> arr = np.random.randint(1, 9, size=(1, 2, 3, 4))
+        >>> input = flow.Tensor(arr)
+        >>> output = input.to(dtype=flow.float32)
+        >>> np.array_equal(arr.astype(np.float32), output.numpy())
+        True
+
+    """
+    copy = kwargs.get("copy", False)
+    device = kwargs.get("device", None)
+    dtype = kwargs.get("dtype", None)
+    if len(args) > 0:
+        if isinstance(args[0], flow.Tensor):
+            if len(args) == 2:
+                copy = args[1]
+            return To(copy)(input, args[0].device, args[0].dtype)
+        elif isinstance(args[0], flow.dtype):
+            if len(args) == 2:
+                copy = args[1]
+            return To(copy)(input, None, args[0])
+        else:
+            device = flow.device(args[0]) if isinstance(args[0], str) else args[0]
+            if len(args) > 1:
+                dtype = args[1]
+                assert isinstance(dtype, flow.dtype)
+            if len(args) > 2:
+                copy = args[2]
+            assert isinstance(device, flow.device)
+            return To(copy)(input, device, dtype)
+    if isinstance(device, flow.device) or isinstance(dtype, flow.dtype):
+        return To(copy)(input, device, dtype)
+    raise TypeError("to() received an invalid combination of arguments")
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/transpose.py b/python/oneflow/compatible/single_client/nn/modules/transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff9d35057a949cb2ab3ba672d3306c062fa9d29e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/transpose.py
@@ -0,0 +1,89 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Transpose(Module):
+    def __init__(
+        self, dim0, dim1, conjugate: bool = False, batch_axis_non_change: bool = False
+    ) -> None:
+        super().__init__()
+        if conjugate:
+            raise NotImplementedError
+        if batch_axis_non_change:
+            raise NotImplementedError
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x):
+        x_shape = x.shape
+        dim0 = self.dim0
+        dim1 = self.dim1
+        if dim0 < 0:
+            dim0 += len(x_shape)
+        if dim1 < 0:
+            dim1 += len(x_shape)
+        assert dim0 >= 0 and dim0 < len(
+            x_shape
+        ), "Invalid dim0 {}, len(shape): {}".format(dim0, len(x_shape))
+        assert dim1 >= 0 and dim1 < len(
+            x_shape
+        ), "Invalid dim1 {}, len(shape): {}".format(dim1, len(x_shape))
+        perm = []
+        for i in range(len(x_shape)):
+            perm.append(i)
+        (perm[dim0], perm[dim1]) = (perm[dim1], perm[dim0])
+        return flow.F.transpose(x, perm=perm)
+
+
+@register_tensor_op("transpose")
+def transpose_op(tensor, dim0, dim1):
+    """Returns a tensor that is a transposed version of input. The given dimensions dim0 and dim1 are swapped.
+
+    The resulting out tensor shares its underlying storage with the input tensor, so changing the content of one would change the content of the other.
+
+    Args:
+        tensor (oneflow.compatible.single_client.Tensor): The input tensor.
+        dim0 (int): the first dimension to be transposed.
+        dim1 (int): the second dimension to be transposed.
+    Returns:
+        Tensor: A transposed tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> out = flow.transpose(input, 0, 1).shape
+        >>> out
+        flow.Size([6, 2, 5, 3])
+
+    """
+    return Transpose(dim0=dim0, dim1=dim1)(tensor)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/triu.py b/python/oneflow/compatible/single_client/nn/modules/triu.py
new file mode 100644
index 0000000000000000000000000000000000000000..96f5d31b1326a7eb6cb9d0b6cf75ce6ff005ccf5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/triu.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Triu(Module):
+    def __init__(self, diagonal=0):
+        super().__init__()
+        self.diagonal = diagonal
+
+    def forward(self, x):
+        return flow.F.triu(x, self.diagonal)
+
+
+@register_tensor_op("triu")
+def triu_op(x, diagonal=0):
+    """Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input, 
+    the other elements of the result tensor out are set to 0.
+    
+    Args:
+        input (Tensor): the input tensor. 
+        diagonal (int, optional): the diagonal to consider
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.ones(shape=(3, 3)).astype(np.float32))
+        >>> flow.triu(x)
+        tensor([[1., 1., 1.],
+                [0., 1., 1.],
+                [0., 0., 1.]], dtype=oneflow.float32)
+
+    """
+    return Triu(diagonal)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/unsqueeze.py b/python/oneflow/compatible/single_client/nn/modules/unsqueeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..42a14a90cdb8263ea9a30f8ad85a8a432edf3b84
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/unsqueeze.py
@@ -0,0 +1,69 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Unsqueeze(Module):
+    def __init__(self, dim: int = 0) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, input):
+        assert (
+            -(1 + input.ndimension()) <= self.dim <= input.ndimension()
+        ), "dim should within the range [-input.ndimension() - 1, input.ndimension() + 1)"
+        if self.dim < 0:
+            self.dim = 1 + input.ndimension() + self.dim
+        return flow.F.expand_dims(input, axis=self.dim)
+
+
+@register_tensor_op("unsqueeze")
+def unsqueeze_op(input, dim):
+    """Returns a new tensor with a dimension of size one inserted at the
+    specified position.
+
+    The returned tensor shares the same underlying data with this tensor.
+
+    A :attr:`dim` value within the range `[-input.ndimension() - 1, input.ndimension() + 1)`
+    can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze`
+    applied at :attr:`dim` = ``dim + input.ndimension() + 1``.
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the index at which to insert the singleton dimension
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(np.random.rand(2, 3, 4))
+        >>> y = x.unsqueeze(2)
+        >>> y.shape
+        flow.Size([2, 3, 1, 4])
+    """
+    return Unsqueeze(dim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/upsampling.py b/python/oneflow/compatible/single_client/nn/modules/upsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..29ac7771f56a4373511343793659796eb5d923e1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/upsampling.py
@@ -0,0 +1,257 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Upsample(Module):
+    """Upsamples a given multi-channel 2D (spatial) data.
+
+    The input data is assumed to be of the form
+    `minibatch x channels x height x width`.
+    Hence, for spatial inputs, we expect a 4D Tensor.
+
+    The algorithms available for upsampling are nearest neighbor,
+    bilinear, 4D input Tensor, respectively.
+
+    One can either give a :attr:`scale_factor` or the target output :attr:`size` to
+    calculate the output size. (You cannot give both, as it is ambiguous)
+
+    Args:
+        size (int or Tuple[int, int] optional):
+            output spatial sizes
+        scale_factor (float or Tuple[float, float], optional):
+            multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
+            ``'bilinear'``.
+            Default: ``'nearest'``
+        align_corners (bool, optional): if ``True``, the corner pixels of the input
+            and output tensors are aligned, and thus preserving the values at
+            those pixels. This only has effect when :attr:`mode` is ``'bilinear'``.
+            Default: ``False``
+
+    Shape:
+        - Input: : :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` , where
+
+    .. math::
+        D_{out} = \\left\\lfloor D_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+        H_{out} = \\left\\lfloor H_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+        W_{out} = \\left\\lfloor W_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. note::
+        If you want downsampling/general resizing, you should use :func:`~nn.functional.interpolate`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 2, 2)), dtype=flow.float32)
+        >>> input = input.to("cuda")
+        >>> m = flow.nn.Upsample(scale_factor=2.0, mode="nearest")
+        >>> output = m(input)
+        >>> output #doctest: +ELLIPSIS
+        tensor([[[[1., 1., 2., 2.],
+                  ...
+                  [3., 3., 4., 4.]]]], device='cuda:0', dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        size: Optional[Union[int, Tuple[int, ...]]] = None,
+        scale_factor: Optional[Union[float, Tuple[float, ...]]] = None,
+        mode: str = "nearest",
+        align_corners: Optional[bool] = None,
+    ):
+        super().__init__()
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple((float(factor) for factor in scale_factor))
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        self.mode = mode
+        if align_corners == None:
+            align_corners = False
+        self.align_corners = align_corners
+        self.height_scale = None
+        self.width_scale = None
+        if isinstance(self.scale_factor, float):
+            self.height_scale = self.scale_factor
+            self.width_scale = self.scale_factor
+        elif isinstance(self.scale_factor, tuple):
+            self.height_scale = self.scale_factor[0]
+            self.width_scale = self.scale_factor[1]
+        else:
+            pass
+        if self.mode != "nearest" and self.mode != "bilinear":
+            raise ValueError('interpolation must be "nearest" or "bilinear".')
+        if self.mode == "nearest" and self.align_corners:
+            raise ValueError('interpolation "nearest" does not support align_corners.')
+
+    def forward(self, x):
+        assert (
+            self.size != None or self.scale_factor != None
+        ), f"size and scale_factor can not be none at the same time!"
+        (h, w) = (x.shape[2], x.shape[3])
+        if self.height_scale == None:
+            if isinstance(self.size, int):
+                self.height_scale = 1.0 * self.size / h
+            else:
+                self.height_scale = 1.0 * self.size[0] / h
+        if self.width_scale == None:
+            if isinstance(self.size, int):
+                self.width_scale = 1.0 * self.size / w
+            else:
+                self.width_scale = 1.0 * self.size[1] / w
+        res = flow.F.upsample(
+            x,
+            height_scale=self.height_scale,
+            width_scale=self.width_scale,
+            align_corners=self.align_corners,
+            interpolation=self.mode,
+            data_format="channels_first",
+        )
+        return res
+
+
+class UpsamplingNearest2d(Upsample):
+    """Applies a 2D nearest neighbor upsampling to an input signal composed of several input
+    channels.
+
+    To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor`
+    as it's constructor argument.
+
+    When :attr:`size` is given, it is the output size of the image `(h, w)`.
+
+    Args:
+        size (int or Tuple[int, int], optional): output spatial sizes
+        scale_factor (float or Tuple[float, float], optional): multiplier for
+            spatial size.
+
+    .. warning::
+        This class is deprecated in favor of :func:`~nn.functional.interpolate`.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+    .. math::
+          H_{out} = \\left\\lfloor H_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+          W_{out} = \\left\\lfloor W_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 2, 2)), dtype=flow.float32)
+        >>> input = input.to("cuda")
+        >>> m = flow.nn.UpsamplingNearest2d(scale_factor=2.0)
+        >>> output = m(input)
+        >>> output #doctest: +ELLIPSIS
+        tensor([[[[1., 1., 2., 2.],
+                  ...
+                  [3., 3., 4., 4.]]]], device='cuda:0', dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        size: Optional[Tuple[int, int]] = None,
+        scale_factor: Optional[Tuple[float, float]] = None,
+    ) -> None:
+        super(UpsamplingNearest2d, self).__init__(size, scale_factor, mode="nearest")
+
+
+class UpsamplingBilinear2d(Upsample):
+    """Applies a 2D bilinear upsampling to an input signal composed of several input
+    channels.
+
+    To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor`
+    as it's constructor argument.
+
+    When :attr:`size` is given, it is the output size of the image `(h, w)`.
+
+    Args:
+        size (int or Tuple[int, int], optional): output spatial sizes
+        scale_factor (float or Tuple[float, float], optional): multiplier for
+            spatial size.
+
+    .. warning::
+        This class is deprecated in favor of :func:`~nn.functional.interpolate`. It is
+        equivalent to ``nn.functional.interpolate(..., mode='bilinear', align_corners=True)``.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+    .. math::
+        H_{out} = \\left\\lfloor H_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+        W_{out} = \\left\\lfloor W_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 2, 2)), dtype=flow.float32)
+        >>> input = input.to("cuda")
+        >>> m = flow.nn.UpsamplingBilinear2d(scale_factor=2.0)
+        >>> output = m(input)
+        >>> output #doctest: +ELLIPSIS
+        tensor([[[[1.    , 1.3333, 1.6667, 2.    ],
+                  ...
+                  [3.    , 3.3333, 3.6667, 4.    ]]]], device='cuda:0',
+               dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        size: Optional[Tuple[int, int]] = None,
+        scale_factor: Optional[Tuple[float, float]] = None,
+    ) -> None:
+        super(UpsamplingBilinear2d, self).__init__(
+            size, scale_factor, mode="bilinear", align_corners=True
+        )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/utils.py b/python/oneflow/compatible/single_client/nn/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..56a4c1f41cea28e6b6630d3587393a1bb889a78a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/utils.py
@@ -0,0 +1,73 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from collections import abc as container_abcs
+from itertools import repeat
+from typing import List
+
+
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, container_abcs.Iterable):
+            return tuple(x)
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+_single = _ntuple(1)
+_pair = _ntuple(2)
+_triple = _ntuple(3)
+_quadruple = _ntuple(4)
+
+
+def _reverse_repeat_tuple(t, n):
+    """Reverse the order of `t` and repeat each element for `n` times.
+    This can be used to translate padding arg used by Conv and Pooling modules
+    to the ones used by `F.pad`.
+    """
+    return tuple((x for x in reversed(t) for _ in range(n)))
+
+
+def _list_with_default(out_size, defaults):
+    if isinstance(out_size, int):
+        return out_size
+    if len(defaults) <= len(out_size):
+        raise ValueError(
+            "Input dimension should be at least {}".format(len(out_size) + 1)
+        )
+    return [
+        v if v is not None else d
+        for (v, d) in zip(out_size, defaults[-len(out_size) :])
+    ]
+
+
+def _check_axis(axis, shape):
+    ndim = len(shape)
+    if axis is None:
+        axis = list(range(len(shape)))
+    if isinstance(axis, int):
+        axis = [axis]
+    assert isinstance(axis, (list, tuple)), "Invalid axis {}".format(axis)
+    axis = list(axis)
+    for i in range(len(axis)):
+        assert (
+            -ndim <= axis[i] <= ndim - 1
+        ), "Dimension out of range (expected to be in range of [{}, {}], but got {})".format(
+            -ndim, ndim - 1, axis[i]
+        )
+        if axis[i] < 0:
+            axis[i] = axis[i] + ndim
+    return axis
diff --git a/python/oneflow/compatible/single_client/nn/modules/where.py b/python/oneflow/compatible/single_client/nn/modules/where.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f5312c04acb2f19f908075a4452c82e0dfbbf62
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/where.py
@@ -0,0 +1,132 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import register_tensor_op
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class Where(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, condition, x, y):
+        assert condition.dtype == flow.int32 or condition.dtype == flow.int8
+        if isinstance(x, int) or isinstance(x, float):
+            x = flow.Tensor(
+                [float(x)],
+                dtype=flow.float32,
+                device=flow.device(condition.device.type),
+            )
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)],
+                dtype=flow.float32,
+                device=flow.device(condition.device.type),
+            )
+        assert (
+            condition.device.type == x.device.type
+            and condition.device.type == y.device.type
+        )
+        assert len(condition.shape) == len(x.shape) and len(condition.shape) == len(
+            y.shape
+        ), f"The dim of where module's inputs can not match, please check!"
+        broadcast_cond = condition
+        broadcast_x = x
+        broadcast_y = y
+        broadcast_like_shape = []
+        broadcast_condition_axes = []
+        broadcast_x_axes = []
+        broadcast_y_axes = []
+        for i in range(len(x.shape)):
+            max_dim = max(x.shape[i], max(y.shape[i], condition.shape[i]))
+            broadcast_like_shape.append(max_dim)
+            if max_dim != condition.shape[i]:
+                broadcast_condition_axes.append(i)
+            if max_dim != x.shape[i]:
+                broadcast_x_axes.append(i)
+            if max_dim != y.shape[i]:
+                broadcast_y_axes.append(i)
+        broadcast_like_tensor = flow.experimental.zeros(
+            tuple(broadcast_like_shape), dtype=flow.float32
+        )
+        broadcast_like_tensor = broadcast_like_tensor.to(x.device.type)
+        broadcast_like_tensor.requires_grad = x.requires_grad or y.requires_grad
+        if len(broadcast_condition_axes) != 0:
+            condition = flow.experimental.cast(condition, flow.float32)
+            broadcast_cond = flow.experimental.broadcast_like(
+                condition, broadcast_like_tensor, tuple(broadcast_condition_axes)
+            )
+            broadcast_cond = flow.experimental.cast(broadcast_cond, flow.int32)
+        if len(broadcast_x_axes) != 0:
+            broadcast_x = flow.experimental.broadcast_like(
+                x, broadcast_like_tensor, broadcast_axes=tuple(broadcast_x_axes)
+            )
+        if len(broadcast_y_axes) != 0:
+            broadcast_y = flow.experimental.broadcast_like(
+                y, broadcast_like_tensor, broadcast_axes=tuple(broadcast_y_axes)
+            )
+        return flow.F.where(broadcast_cond, broadcast_x, broadcast_y)
+
+
+@register_tensor_op("where")
+def where_op(condition, x, y):
+    """Return a tensor of elements selected from either :attr:`x` or :attr:`y`, depending on :attr:`condition`.
+    If the element in condition is larger than 0,
+
+    it will take the `x` element, else it will take the `y` element
+
+    .. note::
+
+        The tensors :attr:`condition`, :attr:`x`, :attr:`y` must be broadcastable.
+        It will take the `x` element, else it will take the `y` element.
+
+    Args:
+        condition (IntTensor): When 1 (nonzero), yield x, otherwise yield y
+        x (Tensor or Scalar): value (if :attr:x is a scalar) or values selected at indices
+                            where :attr:`condition` is True
+        y (Tensor or Scalar): value (if :attr:x is a scalar) or values selected at indices
+                            where :attr:`condition` is False
+    Returns:
+        Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`x`, :attr:`y`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> flow.enable_eager_execution()
+
+        >>> x = flow.Tensor(
+        ...    np.array([[-0.4620, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
+        ...    dtype=flow.float32,
+        ... )
+        >>> y = flow.Tensor(np.ones(shape=(3, 2)), dtype=flow.float32)
+        >>> condition = flow.Tensor(np.array([[0, 1], [1, 0], [1, 0]]), dtype=flow.int32)
+        >>> out = condition.where(x, y)
+        >>> out #doctest: +ELLIPSIS
+        tensor([[1.    , 0.3139],
+                ...
+                [0.0478, 1.    ]], dtype=oneflow.float32)
+
+    """
+    return Where()(condition, x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/modules/zeropad2d.py b/python/oneflow/compatible/single_client/nn/modules/zeropad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f644f549b43a315a78abf36b45b8bce4107389d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/modules/zeropad2d.py
@@ -0,0 +1,120 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.module import Module
+
+
+class ZeroPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ZeroPad2d.html
+
+    Pads the input tensor boundaries with zero. User can set the amount of padding by setting the parameter `paddings`.
+
+    Args:
+        padding (Union[int, tuple]):  the size of the padding. If is `int`, uses the same padding in all boundaries. If a 4-`tuple`, uses (:math:`\\mathrm{padding_{left}}`, :math:`\\mathrm{padding_{right}}`, :math:`\\mathrm{padding_{top}}`, :math:`\\mathrm{padding_{bottom}}`)
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+            :math:`H_{out} = H_{in} + \\mathrm{padding_{top}} + \\mathrm{padding_{bottom}}`
+
+            :math:`W_{out} = W_{in} + \\mathrm{padding_{left}} + \\mathrm{padding_{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.compatible.single_client.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+        >>> zeropad_layer_int = flow.nn.ZeroPad2d(2)
+        >>> zeropad_layer_tuple = flow.nn.ZeroPad2d((1,2,2,0))
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32))
+        >>> output_int = zeropad_layer_int(input)
+        >>> output_int.shape
+        flow.Size([1, 2, 7, 7])
+        >>> output_int
+        tensor([[[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  1.,  2.,  0.,  0.],
+                  [ 0.,  0.,  3.,  4.,  5.,  0.,  0.],
+                  [ 0.,  0.,  6.,  7.,  8.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+        <BLANKLINE>
+                 [[ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  9., 10., 11.,  0.,  0.],
+                  [ 0.,  0., 12., 13., 14.,  0.,  0.],
+                  [ 0.,  0., 15., 16., 17.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]]], dtype=oneflow.float32)
+        >>> output_tuple = zeropad_layer_tuple(input)
+        >>> output_tuple
+        tensor([[[[ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  1.,  2.,  0.,  0.],
+                  [ 0.,  3.,  4.,  5.,  0.,  0.],
+                  [ 0.,  6.,  7.,  8.,  0.,  0.]],
+        <BLANKLINE>
+                 [[ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  9., 10., 11.,  0.,  0.],
+                  [ 0., 12., 13., 14.,  0.,  0.],
+                  [ 0., 15., 16., 17.,  0.,  0.]]]], dtype=oneflow.float32)
+    """
+
+    def __init__(self, padding: Union[int, tuple]):
+        super().__init__()
+        if isinstance(padding, tuple):
+            assert len(padding) == 4, ValueError("Length of padding must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be int  or tuple!")
+        self.padding = boundary
+        self.value = 0.0
+
+    def forward(self, x):
+        (_, _, h, w) = x.shape
+        if x.dtype in [flow.float32, flow.float16, flow.float64]:
+            floating_value = float(self.value)
+            integral_value = int(0)
+        else:
+            floating_value = float(0)
+            integral_value = int(self.value)
+        self._op = (
+            flow.builtin_op("constant_pad2d")
+            .Input("x")
+            .Output("y")
+            .Attr("padding", self.padding)
+            .Attr("floating_value", floating_value)
+            .Attr("integral_value", integral_value)
+            .Build()
+        )
+        res = self._op(x)[0]
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/__init__.py b/python/oneflow/compatible/single_client/nn/optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/adam.py b/python/oneflow/compatible/single_client/nn/optimizer/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b062b2d33521c3316c7abb6324fbd47ca6da73f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/adam.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Callable, Dict, Iterator, List, Tuple, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.optimizer.optimizer import (
+    Optimizer,
+    ParamGroup,
+)
+from oneflow.compatible.single_client.nn.parameter import Parameter
+
+
+class Adam(Optimizer):
+    """Implements Adam algorithm.
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
+    The implementation of the L2 penalty follows changes proposed in
+    `Decoupled Weight Decay Regularization`_.
+
+    This algorithm can adjust the learning rate of each parameter dynamically according to the 1st-moment estimates and the 2nd-moment estimates of gradient.
+
+    the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        scale (float, optional): the scale factor of loss (default: 1.0)
+
+    .. _Adam\\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _Decoupled Weight Decay Regularization:
+        https://arxiv.org/abs/1711.05101
+
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        betas: Tuple[float, float] = (0.9, 0.999),
+        eps: float = 1e-08,
+        weight_decay: float = 0,
+        amsgrad: bool = False,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert eps >= 0.0, f"Invalid epsilon value: {eps}"
+        assert (
+            betas[0] >= 0.0 and betas[0] < 1.0
+        ), f"Invalid beta parameter at index 0: {betas[0]}"
+        assert (
+            betas[1] >= 0.0 and betas[1] < 1.0
+        ), f"Invalid beta parameter at index 1: {betas[1]}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
+        assert scale > 0.0, f"Invalid scale factor: {scale}"
+        assert amsgrad is False, "Not support AMSGrad now!"
+        self._default_options["lr"] = lr
+        self._default_options["eps"] = eps
+        self._default_options["betas"] = betas
+        self._default_options["weight_decay"] = weight_decay
+        self._default_options["amsgrad"] = amsgrad
+        self._default_options["scale"] = scale
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                self._state[param]["exp_avg"] = flow.experimental.zeros_like(param)
+                self._state[param]["exp_avg_sq"] = flow.experimental.zeros_like(param)
+        self._op = (
+            flow.builtin_op("adam_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("m")
+            .Input("v")
+            .Attr("l1", 0.0)
+            .Attr("weight_decay", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                kwargs = {
+                    "learning_rate_val": param_group["lr"],
+                    "scale": param_group["scale"],
+                    "l2": param_group["weight_decay"],
+                    "beta1": param_group["betas"][0],
+                    "beta2": param_group["betas"][1],
+                    "epsilon": param_group["eps"],
+                }
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    m_tensor = self._state[param]["exp_avg"]
+                    v_tensor = self._state[param]["exp_avg_sq"]
+                    self._op(param, param.grad, m_tensor, v_tensor, **kwargs)
+            self._state["step"] = self._state["step"] + 1
+            return loss
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/adamw.py b/python/oneflow/compatible/single_client/nn/optimizer/adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1eb2e257b0af0ba6c610f9c956a98eb8947b1d7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/adamw.py
@@ -0,0 +1,146 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Callable, Dict, Iterator, List, Tuple, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.optimizer.optimizer import (
+    Optimizer,
+    ParamGroup,
+)
+from oneflow.compatible.single_client.nn.parameter import Parameter
+
+
+class AdamW(Optimizer):
+    """Implements AdamW algorithm.
+
+    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
+    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
+
+    The optimizer of the Adam-weight-decay algorithm.
+
+    (More details please refer to `Adam-weight-decay <https://www.fast.ai/2018/07/02/adam-weight-decay/>`_).
+
+    So we use Adam-weight-decay algorithm to solve this problem.
+
+    the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*(\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}+\\lambda*param_{old})
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (In the equation is 位, default: 0)
+        scale (float, optional): the scale factor of loss (default: 1.0)
+
+    .. _Adam\\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _Decoupled Weight Decay Regularization:
+        https://arxiv.org/abs/1711.05101
+
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        betas: Tuple[float, float] = (0.9, 0.999),
+        eps: float = 1e-08,
+        weight_decay: float = 0,
+        amsgrad: bool = False,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert eps >= 0.0, f"Invalid epsilon value: {eps}"
+        assert (
+            betas[0] >= 0.0 and betas[0] < 1.0
+        ), f"Invalid beta parameter at index 0: {betas[0]}"
+        assert (
+            betas[1] >= 0.0 and betas[1] < 1.0
+        ), f"Invalid beta parameter at index 1: {betas[1]}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
+        assert scale > 0.0, f"Invalid scale factor: {scale}"
+        assert amsgrad is False, "Not support AMSGrad now!"
+        self._default_options["lr"] = lr
+        self._default_options["eps"] = eps
+        self._default_options["betas"] = betas
+        self._default_options["weight_decay"] = weight_decay
+        self._default_options["amsgrad"] = amsgrad
+        self._default_options["scale"] = scale
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                self._state[param]["exp_avg"] = flow.experimental.zeros_like(param)
+                self._state[param]["exp_avg_sq"] = flow.experimental.zeros_like(param)
+        self._op = (
+            flow.builtin_op("adam_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("m")
+            .Input("v")
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                kwargs = {
+                    "learning_rate_val": param_group["lr"],
+                    "scale": param_group["scale"],
+                    "weight_decay": param_group["weight_decay"],
+                    "beta1": param_group["betas"][0],
+                    "beta2": param_group["betas"][1],
+                    "epsilon": param_group["eps"],
+                }
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    m_tensor = self._state[param]["exp_avg"]
+                    v_tensor = self._state[param]["exp_avg_sq"]
+                    self._op(param, param.grad, m_tensor, v_tensor, **kwargs)
+            self._state["step"] = self._state["step"] + 1
+            return loss
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/cosine_annealing_lr.py b/python/oneflow/compatible/single_client/nn/optimizer/cosine_annealing_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd6dea7792d92931c769887daacf9899950c8a2f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/cosine_annealing_lr.py
@@ -0,0 +1,81 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+from .lr_scheduler import LrScheduler
+
+
+class CosineAnnealingLR(LrScheduler):
+    """This operator creates a Cosine decayed learning rate scheduler.
+
+    Before the steps are specified by user, the learning rate will be updated as:
+
+    .. math::
+
+        & cos\\_decay = 0.5*(1+cos(\\pi*\\frac{current\\_step}{steps}))
+
+        & decay\\_factor = (1-\\alpha)*cos\\_decay+\\alpha
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_factor
+
+    After the steps specified by user, the learning rate will be :
+
+    .. math::
+
+        learning\\_rate = {base\\_learning\\_rate}*{\\alpha}
+
+    It has been proposed in
+    `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this only
+    implements the cosine annealing part of SGDR, and not the restarts.
+
+    Args:
+        optimizer(Optimizer): Wrapped optimizer.
+        steps (int): The decay steps in the scheduler.
+        alpha (float, optional): The learning rate scale factor (:math:`\\alpha`). (default: 0.0)
+        last_step (int, optional): The index of last step. (default: -1)
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. (default: ``False``)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client.experimental as flow
+
+        ...
+        cosine_annealing_lr = flow.optim.lr_scheduler.CosineAnnealingLR(optimizer, steps=100, alpha=0.0)
+        for epoch in range(num_epoch):
+            train(...)
+            cosine_annealing_lr.step()
+
+    .. _SGDR\\: Stochastic Gradient Descent with Warm Restarts:
+        https://arxiv.org/abs/1608.03983
+    """
+
+    def __init__(
+        self, optimizer, steps: int, alpha: float = 0.0, last_step=-1, verbose=False
+    ):
+        assert steps > 0, f"steps must greater than zero, but got {steps}"
+        self.steps = steps
+        self.alpha = alpha
+        super().__init__(optimizer, last_step, verbose)
+
+    def get_lr(self):
+        if self.last_step < self.steps:
+            cos_decay = 0.5 * (1 + math.cos(math.pi * self.last_step / self.steps))
+            decay_factor = (1 - self.alpha) * cos_decay + self.alpha
+            return [base_lr * decay_factor for base_lr in self.base_lrs]
+        else:
+            return [base_lr * self.alpha for base_lr in self.base_lrs]
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/lambda_lr.py b/python/oneflow/compatible/single_client/nn/optimizer/lambda_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad0e793fc9505c3a3b3867abe4f9b96146ac341e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/lambda_lr.py
@@ -0,0 +1,100 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import types
+
+from .lr_scheduler import LrScheduler
+
+
+class LambdaLR(LrScheduler):
+    """
+    Sets the learning rate of each parameter group to the initial lr times a given function.
+    When last_step=-1, sets initial lr as lr.
+
+    .. math::
+
+        learning\\_rate = base\\_learning\\_rate*lambda(last\\_step)
+
+    Args:
+        optimizer(Optimizer): Wrapped optimizer.
+        lr_lambda(function or list): A function which computes a multiplicative factor given an integer
+            parameter epoch, or a list of such functions, one for each group in optimizer.param_groups.
+        last_step (int, optional): The index of last step. (default: -1)
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. (default: ``False``)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client.experimental as flow
+
+        ...
+        lambda1 = lambda step: step // 30
+        lambda2 = lambda step: 0.95 * step
+        lambda_lr = flow.optim.lr_scheduler.LambdaLR(optimizer, [lambda1, lambda2])
+        for epoch in range(num_epoch):
+            train(...)
+            lambda_lr.step()
+
+    """
+
+    def __init__(self, optimizer, lr_lambda, last_step=-1, verbose=False):
+        if not isinstance(lr_lambda, (list, tuple)):
+            self.lr_lambdas = [lr_lambda] * len(optimizer.param_groups)
+        else:
+            assert len(lr_lambda) == len(
+                optimizer.param_groups
+            ), f"Expected {len(optimizer.param_groups)} lr_lambdas, but got {len(lr_lambda)}"
+            self.lr_lambdas = list(lr_lambda)
+        super().__init__(optimizer, last_step, verbose)
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the optimizer.
+        The learning rate lambda functions will only be saved if they are callable objects
+        and not if they are functions or lambdas.
+        """
+        state_dict = {
+            key: value
+            for (key, value) in self.__dict__.items()
+            if key not in ("optimizer", "lr_lambdas")
+        }
+        state_dict["lr_lambdas"] = [None] * len(self.lr_lambdas)
+        for (idx, fn) in enumerate(self.lr_lambdas):
+            if not isinstance(fn, types.FunctionType):
+                state_dict["lr_lambdas"][idx] = fn.__dict__.copy()
+        return state_dict
+
+    def load_state_dict(self, state_dict):
+        """Loads the schedulers state.
+
+        Arguments:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        lr_lambdas = state_dict.pop("lr_lambdas")
+        self.__dict__.update(state_dict)
+        state_dict["lr_lambdas"] = lr_lambdas
+        for (idx, fn) in enumerate(lr_lambdas):
+            if fn is not None:
+                self.lr_lambdas[idx].__dict__.update(fn)
+
+    def get_lr(self):
+        return [
+            base_lr * lmbda(self.last_step)
+            for (lmbda, base_lr) in zip(self.lr_lambdas, self.base_lrs)
+        ]
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/lr_scheduler.py b/python/oneflow/compatible/single_client/nn/optimizer/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfaae3047ea1f39975a4e337e0aea8500368561c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/lr_scheduler.py
@@ -0,0 +1,78 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .optimizer import Optimizer
+
+
+class LrScheduler(object):
+    def __init__(self, optimizer, last_step=-1, verbose=False):
+        if not isinstance(optimizer, Optimizer):
+            raise TypeError(f"{type(optimizer).__name__} is not an Optimizer object")
+        self._optimizer = optimizer
+        if last_step == -1:
+            for group in self._optimizer.param_groups:
+                group["initial_lr"] = group["lr"]
+        else:
+            for (i, group) in enumerate(self._optimizer.param_groups):
+                assert (
+                    "initial_lr" in group
+                ), f"param 'initial_lr' is not specified in param_groups[{i}] when resuming an optimizer"
+        self.base_lrs = [group["initial_lr"] for group in self._optimizer.param_groups]
+        self.last_lr = list()
+        self.last_step = last_step
+        self.verbose = verbose
+        self.step()
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the optimizer.
+        """
+        return {
+            key: value for (key, value) in self.__dict__.items() if key != "_optimizer"
+        }
+
+    def load_state_dict(self, state_dict):
+        """Loads the schedulers state.
+
+        Arguments:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        self.__dict__.update(state_dict)
+
+    def get_lr(self):
+        """Compute learning rate using chainable form of the scheduler
+        """
+        raise NotImplementedError
+
+    def get_last_lr(self):
+        """ Return last computed learning rate by current scheduler.
+        """
+        return self.last_lr
+
+    def print_lr(self, group_idx, lr):
+        """Display the current learning rate.
+        """
+        print(f"Adjusting learning rate of param_groups[{group_idx}] to {lr}")
+
+    def step(self):
+        self.last_step += 1
+        self.last_lr = self.get_lr()
+        for (i, group) in enumerate(self._optimizer.param_groups):
+            group["lr"] = self.last_lr[i]
+            if self.verbose:
+                self.print_lr(i, self.last_lr[i])
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/optimizer.py b/python/oneflow/compatible/single_client/nn/optimizer/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f6a36bf1eb7a9fa16729fc20a6400c81db66fe
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/optimizer.py
@@ -0,0 +1,111 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import warnings
+from typing import Any, Callable, Dict, Iterator, Union
+
+from oneflow.compatible.single_client.framework.tensor import Tensor
+from oneflow.compatible.single_client.nn.parameter import Parameter
+
+
+class ParamGroup(object):
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], Dict[str, Any]],
+        default_options: Dict,
+    ):
+        if isinstance(parameters, collections.abc.Iterator):
+            self._parameters = list(parameters)
+            self._options = default_options
+        else:
+            assert "params" in parameters
+            self._parameters = list(parameters["params"])
+            self._options = default_options
+            for key in self._options:
+                if key in parameters:
+                    self._options[key] = parameters[key]
+
+    def __getitem__(self, key):
+        return self._options[key]
+
+    def __setitem__(self, key, value):
+        self._options[key] = value
+
+    @property
+    def options(self):
+        return self._options
+
+    @property
+    def parameters(self):
+        return self._parameters
+
+
+class Optimizer(object):
+    def __init__(self):
+        self.param_groups = list()
+        self._default_options = dict()
+        self._state = dict()
+        self._state["step"] = 0
+        self._op = None
+
+    def add_param_group(self, param_group) -> None:
+        raise NotImplementedError()
+
+    def load_state_dict(self, state_dict) -> None:
+        raise NotImplementedError()
+
+    def state_dict(self):
+        raise NotImplementedError()
+
+    def step(self, closure: Union[Callable, None] = None) -> Union[Tensor, None]:
+        raise NotImplementedError()
+
+    def zero_grad(self, set_to_none: bool = False):
+        """Sets the gradients of all optimized torch.Tensor s to zero.
+
+        Args:
+            set_to_none (bool): instead of setting to zero, set the grads to None.
+                This will in general have lower memory footprint, and can modestly
+                improve performance. However, it changes certain behaviors.
+        For example:
+            1. When the user tries to access a gradient and perform manual ops on
+            it, a None attribute or a Tensor full of 0s will behave differently.
+
+            2. If the user requests zero_grad(set_to_none=True) followed by a
+            backward pass, grads are guaranteed to be None for params that did not
+            receive a gradient.
+
+            3. Optimizers have a different behavior if the gradient is 0 or None
+            (in one case it does the step with a gradient of 0 and in the other
+            it skips the step altogether).
+
+        Returns:
+            None
+
+        """
+        all_grad_is_none = True
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                if param.grad is not None:
+                    all_grad_is_none = False
+                    if set_to_none:
+                        param.grad = None
+                    else:
+                        param.grad.zeros_()
+        if all_grad_is_none:
+            warnings.warn(
+                "\nParameters in optimizer do not have gradient.\nPlease check `loss.backward()` is called or not,\nor try to declare optimizer after calling `module.to()`"
+            )
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/rmsprop.py b/python/oneflow/compatible/single_client/nn/optimizer/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..4797e7f73016726c34b0bfc87f5627176d9bed80
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/rmsprop.py
@@ -0,0 +1,176 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Callable, Dict, Iterator, List, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.optimizer.optimizer import (
+    Optimizer,
+    ParamGroup,
+)
+from oneflow.compatible.single_client.nn.parameter import Parameter
+
+
+class RMSprop(Optimizer):
+    """Implements RMSprop algorithm.
+
+    oot Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    .. math::
+
+        r(w, t) = \\alpha r(w, t-1) + (1 - \\alpha)(\\nabla Q_{i}(w))^2
+
+        W = w - \\frac{\\eta} {\\\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates moving average of the squared gradient for
+    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
+    In some cases, adding a momentum term :math: `\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    .. math::
+
+        r(w, t) = \\alpha r(w, t-1) + (1 - \\alpha)(\\nabla Q_{i}(w))^2
+
+        v(w, t) = \\beta v(w, t-1) + \\frac{\\eta} {\\\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w = w - v(w, t)
+
+    if centered is True:
+
+    .. math::
+
+        r(w, t) = \\alpha r(w, t-1) + (1 - \\alpha)(\\nabla Q_{i}(w))^2
+
+        g(w, t) = \\alpha g(w, t-1) + (1 - \\alpha)\\nabla Q_{i}(w)
+
+        v(w, t) = \\beta v(w, t-1) + \\frac{\\eta} {\\\\sqrt{r(w,t) - (g(w, t))^2 +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w = w - v(w, t)
+
+    where, :math:`\\alpha` is a hyperparameter and typical values are 0.99, 0.95
+    and so on. :math:`\\beta` is the momentum term. :math:`\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in range
+    from 1e-4 to 1e-8.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-2)
+        momentum (float, optional): momentum factor (default: 0, oneflow not support momenmtum > 0 now!)
+        alpha (float, optional): smoothing constant (default: 0.99)
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        centered (bool, optional) : if ``True``, compute the centered RMSProp,
+            the gradient is normalized by an estimation of its variance
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        alpha: float = 0.99,
+        eps: float = 1e-08,
+        weight_decay: float = 0,
+        momentum: float = 0.0,
+        centered: bool = False,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert alpha >= 0.0, f"Invalid alpha value: {alpha}"
+        assert eps >= 0.0, f"Invalid epsilon value: {eps}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
+        assert scale > 0.0, f"Invalid scale factor: {scale}"
+        assert momentum == 0.0, "Not support momentum greater than zeros now!"
+        self._default_options["lr"] = lr
+        self._default_options["alpha"] = alpha
+        self._default_options["eps"] = eps
+        self._default_options["weight_decay"] = weight_decay
+        self._default_options["centered"] = centered
+        self._default_options["scale"] = scale
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                self._state[param]["square_avg"] = flow.experimental.zeros_like(param)
+                if param_group["centered"]:
+                    self._state[param]["grad_avg"] = flow.experimental.zeros_like(param)
+        self._centered_rmsprop = (
+            flow.builtin_op("rmsprop_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("mean_square")
+            .Input("mean_gradient")
+            .Attr("centered", True)
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Build()
+        )
+        self._rmsprop = (
+            flow.builtin_op("rmsprop_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("mean_square")
+            .Attr("centered", False)
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                kwargs = {
+                    "learning_rate_val": param_group["lr"],
+                    "scale": param_group["scale"],
+                    "epsilon": param_group["eps"],
+                    "decay_rate": param_group["alpha"],
+                    "weight_decay": param_group["weight_decay"],
+                }
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    ms_tensor = self._state[param]["square_avg"]
+                    if param_group["centered"]:
+                        mg_tensor = self._state[param]["grad_avg"]
+                        self._centered_rmsprop(
+                            param, param.grad, ms_tensor, mg_tensor, **kwargs
+                        )
+                    else:
+                        self._rmsprop(param, param.grad, ms_tensor, **kwargs)
+            self._state["step"] = self._state["step"] + 1
+            return loss
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/sgd.py b/python/oneflow/compatible/single_client/nn/optimizer/sgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..17f82ea0f50d78067ae7fdc1d28a1f089a7a7f84
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/sgd.py
@@ -0,0 +1,126 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Callable, Dict, Iterator, List, Union
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.nn.parameter import Parameter
+
+from .optimizer import Optimizer, ParamGroup
+
+
+class SGD(Optimizer):
+    """Implements SGD algorithm.
+
+    This algorithm takes a random sample鈥檚 gradient as an approximate estimate of the overall gradient in small batch gradient descent.
+
+    When the momentum = 0, the equation of parameters updating is:
+
+        .. math::
+
+            param_{new} = param_{old} - learning\\_rate * grad
+
+    With momentum, the equation of parameters updating is:
+
+        .. math::
+
+            & V_t = \\beta * V_{t-1} + learning\\_rate * g_t
+
+            & param_{new} = param_{old} - V_t
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        momentum (float, optional): Momentum factor (default: 0.0)
+        scale (float, optional): the scale factor of loss (default: 1.0)
+
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        momentum: float = 0.0,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert momentum >= 0.0, f"Invalid momentum: {momentum}"
+        assert scale >= 0.0, f"Invalid scale factor: {scale}"
+        self._default_options["lr"] = lr
+        self._default_options["scale"] = scale
+        self._default_options["momentum"] = momentum
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                if param_group["momentum"] != 0.0:
+                    self._state[param]["momentum_buf"] = flow.experimental.zeros_like(
+                        param
+                    )
+        self._momentum_sgd = (
+            flow.builtin_op("momentum_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("momentum")
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Attr("weight_decay", 0.0)
+            .Build()
+        )
+        self._sgd = (
+            flow.builtin_op("sgd_update")
+            .Input("model")
+            .Input("model_diff")
+            .Attr("weight_decay", 0.0)
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                lr = param_group["lr"]
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    if param_group["momentum"] == 0.0:
+                        scale = param_group["scale"]
+                        self._sgd(param, param.grad, learning_rate_val=lr, scale=scale)
+                    else:
+                        momentum_buf = self._state[param]["momentum_buf"]
+                        scale = param_group["scale"]
+                        beta = param_group["momentum"]
+                        self._momentum_sgd(
+                            param,
+                            param.grad,
+                            momentum_buf,
+                            learning_rate_val=lr,
+                            scale=scale,
+                            beta=beta,
+                        )
+            self._state["step"] = self._state["step"] + 1
+            return loss
diff --git a/python/oneflow/compatible/single_client/nn/optimizer/step_lr.py b/python/oneflow/compatible/single_client/nn/optimizer/step_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..91560acc89db06474ff8b31505a69762988f3386
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/optimizer/step_lr.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .lr_scheduler import LrScheduler
+
+
+class StepLR(LrScheduler):
+    """
+    Decays the learning rate of each parameter group by gamma every step_size steps.
+    Notice that such decay can happen simultaneously with other changes to the learning
+    rate fromoutside this scheduler. When last_step=-1, sets initial lr as lr.
+
+    Args:
+        optimizer(Optimizer): Wrapped optimizer.
+        step_size (int): Period of learning rate decay.
+        gamma (float, optional): Multiplicative factor of learning rate decay. (default: 0.1)
+        last_step (int, optional): The index of last step. (default: -1)
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. (default: ``False``)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client.experimental as flow
+
+        ...
+        step_lr = flow.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
+        for epoch in range(num_epoch):
+            train(...)
+            step_lr.step()
+
+    """
+
+    def __init__(
+        self, optimizer, step_size: int, gamma: float = 0.1, last_step=-1, verbose=False
+    ):
+        assert step_size > 0, f"step_size must greater than zero, but got {step_size}"
+        assert gamma > 0.0, f"gamma must greater than zero, but got {gamma}"
+        self.step_size = step_size
+        self.gamma = gamma
+        super().__init__(optimizer, last_step, verbose)
+
+    def get_lr(self):
+        if self.last_step == 0 or self.last_step % self.step_size != 0:
+            return [group["lr"] for group in self._optimizer.param_groups]
+        else:
+            return [group["lr"] * self.gamma for group in self._optimizer.param_groups]
diff --git a/python/oneflow/compatible/single_client/nn/parameter.py b/python/oneflow/compatible/single_client/nn/parameter.py
new file mode 100644
index 0000000000000000000000000000000000000000..4eb1b563da1e05a5d155090d221b65064be6f63d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/nn/parameter.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework.tensor import Tensor
+
+
+class Parameter(Tensor):
+    def __init__(self, data, requires_grad=True):
+        self._data = data
+        self._data.requires_grad = requires_grad
+
+    def __getattr__(self, name):
+        return getattr(self._data, name)
diff --git a/python/oneflow/compatible/single_client/ops/__init__.py b/python/oneflow/compatible/single_client/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aec506331974bc9d1d8805490a43e08f3efe50de
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/__init__.py
@@ -0,0 +1,122 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import re
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.job import placement as placement_cfg
+from oneflow.compatible.single_client.eager import blob_register as blob_register_util
+from oneflow.compatible.single_client.eager import boxing_util as boxing_util
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import (
+    compile_context as compile_context,
+)
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def InputOpByArgBlobDef(blob_def):
+    assert isinstance(blob_def, input_blob_util.ArgBlobDef)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = blob_def.op_name
+    op_conf.input_conf.out = blob_def.blob_name
+    op_conf.input_conf.blob_conf.CopyFrom(blob_def.ToInterfaceBlobConf())
+    blob_def.AddAndInferOp(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = blob_def.op_name
+    lbi.blob_name = blob_def.blob_name
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def ReturnRemoteBlob(remote_blob, allow_cpu_return_op=True):
+    return enable_if.unique([LazyReturnRemoteBlob, EagerReturnRemoteBlob])(
+        remote_blob, allow_cpu_return_op
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyReturnRemoteBlob(remote_blob, allow_cpu_return_op=True):
+    assert isinstance(
+        remote_blob,
+        (
+            oneflow._oneflow_internal.LazyMirroredBlob,
+            oneflow._oneflow_internal.LazyConsistentBlob,
+        ),
+    )
+    (op_conf, lbi, scope) = _GetReturnOpConfAndOutLbiAndScope(
+        remote_blob, allow_cpu_return_op
+    )
+    compile_context.CurJobAddOp(op_conf, scope)
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerReturnRemoteBlob(remote_blob, allow_cpu_return_op=True):
+    if not hob.is_trainable(None):
+        return remote_blob
+    (op_conf, lbi, scope) = _GetReturnOpConfAndOutLbiAndScope(
+        remote_blob, allow_cpu_return_op
+    )
+    if remote_blob.blob_object.op_arg_parallel_attr.is_mirrored():
+        add_and_infer = compile_context.CurJobAddMirroredOp
+    else:
+        add_and_infer = compile_context.CurJobAddConsistentOp
+    op_attribute = add_and_infer(op_conf, scope)
+
+    def BuildInstruction(builder):
+        get_blob_scope = blob_register_util.BnInOp2BlobObjectScope
+        with get_blob_scope(blob_register, op_attribute) as bn_in_op2blob_object:
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatelessCall(
+                cfg_op_attribute,
+                remote_blob.blob_object.parallel_desc_symbol.parallel_conf,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def _GetReturnOpConfAndOutLbiAndScope(remote_blob, allow_cpu_return_op=True):
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = id_util.UniqueStr("Return_")
+    setattr(op_conf.return_conf, "in", remote_blob.unique_name)
+    op_conf.return_conf.out = "out"
+    if allow_cpu_return_op:
+        op_conf.device_tag = "cpu"
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.CopyFrom(remote_blob.parallel_conf)
+
+    def BuildScope(old_scope, builder):
+        return builder.BuildScopeWithNewParallelConf(old_scope, parallel_conf)
+
+    sess = session_ctx.GetDefaultSession()
+    scope = scope_util.MakeScope(BuildScope)
+    return (op_conf, lbi, scope)
diff --git a/python/oneflow/compatible/single_client/ops/array_ops.py b/python/oneflow/compatible/single_client/ops/array_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3a2e57dc19f46a5163e816c82b7aa67846bed66
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/array_ops.py
@@ -0,0 +1,2495 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import operator
+from functools import reduce
+from typing import Iterable, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def gather(
+    params: oneflow._oneflow_internal.BlobDesc,
+    indices: oneflow._oneflow_internal.BlobDesc,
+    validate_indices: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    axis: Optional[int] = None,
+    batch_dims: int = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator gathers slices from params `axis` according to indices.
+
+    Args:
+        params: A `Blob`. The blob from which to gather values. Must be at least rank `axis + 1`.
+        indices: A `Blob`. Index blob. Must be in range [0, params.shape[axis]).
+        axis: A `int`. The axis in params to gather indices from. Defaults to the first dimension.
+            Supports negative indexes.
+        batch_dims: An optional `int`. Defaults to 0.
+        name: A name for the operation (optional).
+    Returns:
+        A blob. Has the same type as params.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def gather_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+                    indice: tp.Numpy.Placeholder(shape=(2, ), dtype=flow.int32)
+        ) -> tp.Numpy:
+            gather_blob = flow.gather(params=x,
+                                    indices=indice,
+                                    axis=1)
+            return gather_blob
+
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]]).astype(np.float32)
+        indice = np.array([0, 2]).astype(np.int32)
+        out = gather_Job(x, indice)
+
+        # out [[1. 3.]
+        #      [4. 6.]
+        #      [7. 9.]]
+
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def gather_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+                    indice: tp.Numpy.Placeholder(shape=(2, ), dtype=flow.int32)
+        ) -> tp.Numpy:
+            gather_blob = flow.gather(params=x,
+                                    indices=indice,
+                                    axis=0)
+            return gather_blob
+
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]]).astype(np.float32)
+        indice = np.array([0, 2]).astype(np.int32)
+        out = gather_Job(x, indice)
+
+        # out [[1. 2. 3.]
+        #      [7. 8. 9.]]
+
+    """
+    params_ndims = len(params.shape)
+    if axis is None:
+        axis = batch_dims
+    elif axis < 0:
+        origin_axis = axis
+        axis += params_ndims
+        assert axis >= 0 and axis < params_ndims, ValueError(
+            "Expected axis to between [%d, %d).  But received: %d "
+            % (-params_ndims, params_ndims, origin_axis)
+        )
+    if batch_dims > 0:
+        if axis == batch_dims:
+            return (
+                flow.user_op_builder(
+                    name if name is not None else id_util.UniqueStr("BatchGather_")
+                )
+                .Op("batch_gather")
+                .Input("in", [params])
+                .Input("indices", [indices])
+                .Output("out")
+                .Build()
+                .InferAndTryRun()
+                .RemoteBlobList()[0]
+            )
+        elif axis > batch_dims:
+            raise NotImplementedError
+        else:
+            raise AttributeError
+    else:
+        return (
+            flow.user_op_builder(
+                name if name is not None else id_util.UniqueStr("Gather_")
+            )
+            .Op("gather")
+            .Input("in", [params])
+            .Input("indices", [indices])
+            .Output("out")
+            .Attr("axis", int(axis))
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+
+
+def flatten(
+    input: oneflow._oneflow_internal.BlobDesc,
+    start_dim: int = 0,
+    end_dim: int = -1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Flattens a contiguous range of dims in a Blob.
+
+    Args:
+        input: A `Blob`.
+        start_dim: The first dim to flatten.
+        end_dim: The last dim to flatten.
+        name: A name for the operation (optional).
+    Returns:
+        A `Blob`, has the same type as `input`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def flatten_Job(input: tp.Numpy.Placeholder(shape=(4, 4, 3, 2), dtype=flow.float32)
+        ) -> tp.Numpy:
+            flatten_blob = flow.flatten(input, start_dim=1, end_dim=-1)
+            return flatten_blob
+
+
+        input = np.zeros((4, 4, 3, 2)).astype(np.float32)
+        out = flatten_Job(input)
+
+        # out.shape (4, 24)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Flatten_")
+    return (
+        flow.user_op_builder(name)
+        .Op("flatten")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("start_dim", start_dim)
+        .Attr("end_dim", end_dim)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def infer_shape(x, shape):
+    dim_index_need_infer = shape.index(-1) if shape.count(-1) == 1 else None
+    in_elem_cnt = reduce(operator.mul, x.shape, 1)
+    out_elem_cnt = reduce(operator.mul, shape, 1)
+    if dim_index_need_infer is not None:
+        assert in_elem_cnt % out_elem_cnt == 0
+        shape[dim_index_need_infer] = int(abs(in_elem_cnt / out_elem_cnt))
+    else:
+        assert in_elem_cnt == out_elem_cnt
+    return shape
+
+
+def reshape(
+    x: oneflow._oneflow_internal.BlobDesc,
+    shape: Sequence[int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator reshapes a Blob.
+    If the Blob is dynamic, it will call `flow.dynamic_reshape` automatically
+
+    We can set one dimension in `shape` as `-1`, the operator will infer the complete shape.
+
+    Args:
+        x: A `Blob`.
+        shape: Shape of the output blob.
+        name: A name for the operation (optional).
+    Returns:
+        A `Blob`, has the same type as `x`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reshape_Job(x: tp.Numpy.Placeholder(shape=(4, 4), dtype=flow.float32)
+        ) -> tp.Numpy:
+            reshape_blob = flow.reshape(x,
+                                        shape=[2, 2, 2, -1])
+            return reshape_blob
+
+
+        x = np.array([[1, 2, 3, 4],
+                    [5, 6, 7, 8],
+                    [9, 10, 11, 12],
+                    [13, 14, 15, 16]]).astype(np.float32)
+        out = reshape_Job(x)
+
+        # out.shape (2, 2, 2, 2)
+
+    """
+    x = flow.cast_to_current_logical_view(x)
+    assert isinstance(shape, tuple) or isinstance(shape, list)
+    shape = list(shape)
+    assert all((dim == -1 or dim > 0 for dim in shape))
+    assert shape.count(-1) <= 1
+    if not x.is_dynamic:
+        if name is None:
+            name = id_util.UniqueStr("Reshape_")
+        return (
+            flow.user_op_builder(name)
+            .Op("reshape")
+            .Input("in", [x])
+            .Output("out")
+            .Attr("shape", infer_shape(x, shape))
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        op_conf = op_conf_util.OperatorConf()
+        setattr(
+            op_conf,
+            "name",
+            name if name is not None else id_util.UniqueStr("DynamicReshape_"),
+        )
+        setattr(op_conf.dynamic_reshape_conf, "in", x.unique_name)
+        op_conf.dynamic_reshape_conf.shape.dim.extend(list(shape))
+        setattr(op_conf.dynamic_reshape_conf, "out", "out")
+        interpret_util.Forward(op_conf)
+        lbi = logical_blob_id_util.LogicalBlobId()
+        lbi.op_name = op_conf.name
+        lbi.blob_name = "out"
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+def reshape_like(
+    x: oneflow._oneflow_internal.BlobDesc,
+    like: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator reshapes the Blob x to be the same as Blob `like` .
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        like (oneflow._oneflow_internal.BlobDesc): A Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reshape_like_Job(x: tp.Numpy.Placeholder(shape=(4, 4), dtype=flow.float32)
+        ) -> tp.Numpy:
+            like_blob = flow.constant(value=1,
+                                    dtype=flow.int8,
+                                    shape=(2, 2, 4))
+            reshape_like_blob = flow.reshape_like(x,
+                                                like=like_blob)
+            return reshape_like_blob
+
+
+        x = np.array([[1, 2, 3, 4],
+                    [5, 6, 7, 8],
+                    [9, 10, 11, 12],
+                    [13, 14, 15, 16]]).astype(np.float32)
+        out = reshape_like_Job(x)
+
+        # out.shape (2, 2, 4)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ReshapeLike_")
+    return (
+        flow.user_op_builder(name)
+        .Op("reshape_like")
+        .Input("in", [x])
+        .Input("like", [like])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def dynamic_reshape(
+    x: oneflow._oneflow_internal.BlobDesc,
+    shape: Sequence[int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator reshapes a dynamic blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        shape (Sequence[int]): The output shape.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def dynamic_reshape_Job(x: tp.Numpy.Placeholder(shape=(1, 3, 64, 64), dtype=flow.float32)
+        ) -> tp.Numpy:
+            reshape_out1 = flow.dynamic_reshape(x, (-1, 64))
+            variable1 = flow.get_variable(
+                "var1",
+                shape=(64, 32),
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            matmul_tensor = flow.matmul(reshape_out1, variable1)
+            reshape_out2 = flow.dynamic_reshape(matmul_tensor, (-1, 8, 4))
+            return reshape_out2
+
+        x = np.random.rand(1, 3, 64, 64).astype(np.float32)
+        out = dynamic_reshape_Job(x)
+
+        # out.shape (192, 8, 4)
+
+    """
+    assert isinstance(shape, tuple) or isinstance(shape, list)
+    shape = list(shape)
+    op_conf = op_conf_util.OperatorConf()
+    setattr(
+        op_conf,
+        "name",
+        name if name is not None else id_util.UniqueStr("DynamicReshape_"),
+    )
+    setattr(op_conf.dynamic_reshape_conf, "in", x.unique_name)
+    op_conf.dynamic_reshape_conf.shape.dim.extend(list(shape))
+    setattr(op_conf.dynamic_reshape_conf, "out", "out")
+    interpret_util.Forward(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def transpose(
+    a: oneflow._oneflow_internal.BlobDesc,
+    perm: Sequence[int] = None,
+    conjugate: bool = False,
+    batch_axis_non_change: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator transposes the specified axis of input Blob.
+
+    Args:
+        a (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        perm (Sequence[int], optional): The list of dimension permutation. Defaults to None.
+        conjugate (bool, optional): Still Unavailable. Defaults to False.
+        batch_axis_non_change (bool, optional): deprecated. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        NotImplementedError: The attribute `conjugate` still unavailable.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A transposed blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def transpose_Job(x: tp.Numpy.Placeholder(shape=(1, 2, 3), dtype=flow.float32)
+        ) -> tp.Numpy:
+            transpose_blob = flow.transpose(x,
+                                            perm=[2, 0, 1])
+            return transpose_blob
+
+        x = np.random.randn(1, 2, 3).astype(np.float32)
+        out = transpose_Job(x)
+
+        # out.shape (3, 1, 2)
+
+    """
+    assert isinstance(perm, (tuple, list))
+    if name is None:
+        name = id_util.UniqueStr("Transpose_")
+    if conjugate:
+        raise NotImplementedError
+    return (
+        flow.user_op_builder(name)
+        .Op("transpose")
+        .Input("input", [a])
+        .Output("output")
+        .Attr("perm", perm)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def slice(
+    x: oneflow._oneflow_internal.BlobDesc,
+    begin: Sequence[int],
+    size: Sequence[int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Extracts a slice from a tensor.
+
+    Args:
+        x: A `Blob`.
+        begin: A list or a tuple, indicate each dimension slice begin, whose length must be equal
+            to x's number of dimensions, the first element of begin must be set to None.
+            (Because the internal op of OneFlow does not support 0-dimension slice at present.)
+        size: A list or a tuple, indicate each dimension slice size, whose length must be equal
+            to x's number of dimensions, the first element of beign must be set to None.
+        name: A name for the operation (optional).
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def slice_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32)
+        ) -> tp.Numpy:
+            slice_blob = flow.slice(x,
+                                    begin=[None, 0],
+                                    size=[None, 2])
+            return slice_blob
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]]).astype(np.float32)
+        out = slice_Job(x)
+
+        # out [[1. 2.]
+        #      [4. 5.]
+        #      [7. 8.]]
+
+    """
+    ndim = len(x.shape)
+    if not isinstance(begin, (list, tuple)) or len(begin) != ndim:
+        raise ValueError(
+            "begin must be a list/tuple with the same length as input tensor's number of dimensions"
+        )
+    if not all((isinstance(b, int) or b is None for b in begin)):
+        raise ValueError("element of begin must be a int or None")
+    if not isinstance(size, (list, tuple)) or len(size) != ndim:
+        raise ValueError(
+            "size must be a list/tuple with the same length as input tensor's number of dimensions."
+        )
+    if not all((isinstance(s, int) or s is None for s in size)):
+        raise ValueError("element of size must be a int or None")
+    slice_tup_list = []
+    for (b, s, dim_size) in zip(begin, size, x.shape):
+        (start, stop, step) = (None, None, 1)
+        if b is not None:
+            if b < -dim_size or b >= dim_size:
+                raise ValueError("element of begin is out of range")
+            start = b
+        if s is not None:
+            if s == -1:
+                stop = dim_size
+            else:
+                if s <= 0 or s > dim_size:
+                    raise ValueError("element of size is invalid")
+                if b + s < dim_size:
+                    stop = b + s
+        slice_tup_list.append((start, stop, step))
+    return slice_v2(x, slice_tup_list, name=name)
+
+
+def check_slice_tup_list(slice_tup_list, shape):
+    ndim = len(shape)
+    if not isinstance(slice_tup_list, (list, tuple)) or len(slice_tup_list) > ndim:
+        raise ValueError(
+            "slice_tup_list must be a list or tuple with length less than or equal to number of dimensions of input tensor"
+        )
+    if len(slice_tup_list) < ndim:
+        slice_tup_list += type(slice_tup_list)(
+            [(None, None, None)] * (ndim - len(slice_tup_list))
+        )
+    start_list = []
+    stop_list = []
+    step_list = []
+    for (slice_tup, dim_size) in zip(slice_tup_list, shape):
+        if not isinstance(slice_tup, (tuple, list)) or len(slice_tup) != 3:
+            raise ValueError(
+                "element of slice_tup_list must be a list or tuple with form (start, stop, step)"
+            )
+        if not all((isinstance(idx, int) or idx is None for idx in slice_tup)):
+            raise ValueError("element of slice tuple must int or None")
+        (start, stop, step) = slice_tup
+        if step is None:
+            step = 1
+        if step == 0:
+            raise ValueError("slice step can't be 0")
+        if start is None:
+            start = 0 if step > 0 else np.iinfo(np.int64).max
+        elif start < -dim_size or start >= dim_size:
+            raise ValueError("slice start must be in range [-size, size)")
+        if stop is None:
+            stop = np.iinfo(np.int64).max if step > 0 else np.iinfo(np.int64).min
+        elif stop < -dim_size - 1 or stop > dim_size:
+            raise ValueError("slice start must be in range [-size-1, size]")
+        start_list.append(start)
+        stop_list.append(stop)
+        step_list.append(step)
+    return (start_list, stop_list, step_list)
+
+
+def slice_v2(
+    x: oneflow._oneflow_internal.BlobDesc,
+    slice_tup_list: Sequence[Tuple[int, int, int]],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Extracts a slice from a tensor.
+    The `slice_tup_list` assigns the slice indices in each dimension, the format is (start, stop, step).
+    The operator will slice the Blob according to the `slice_top_list`.
+
+    Args:
+        x: A `Blob`.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+        name: A name for the operation (optional).
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    Note: Because the internal op of OneFlow does not support 0-dimension slice at present, we should
+    set the zero element in `slice_tup_list` as `None`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+        @flow.global_function()
+        def slicev2_Job(x: tp.Numpy.Placeholder(shape=(3, 6, 9), dtype=flow.float32)
+        ) -> tp.Numpy:
+            slicev2_blob = flow.slice_v2(x,
+                                        slice_tup_list=[[None, None, None],
+                                                        [0, 5, 2], # slice in dimension 1, extract [0, 2, 4]
+                                                        [0, 6, 3]]) # slice in dimension 2, extract [0, 3]
+            return slicev2_blob
+        x = np.random.randn(3, 6, 9).astype(np.float32)
+        out = slicev2_Job(x)
+
+        # out.shape (3, 3, 2)
+
+    """
+    name = name or id_util.UniqueStr("Slice_")
+    if not isinstance(name, str):
+        raise ValueError("name must be a string")
+    (start, stop, step) = check_slice_tup_list(slice_tup_list, x.shape)
+    op = (
+        flow.user_op_builder(name)
+        .Op("slice")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("start", start)
+        .Attr("stop", stop)
+        .Attr("step", step)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def api_slice_update(
+    x: oneflow._oneflow_internal.BlobDesc,
+    update: oneflow._oneflow_internal.BlobDesc,
+    slice_tup_list: Sequence[Tuple[int, int, int]],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Update a slice of tensor `x`. Like `x[start:stop:step] = update`. 
+
+    Args:
+        x: A `Blob`, whose slice will be updated.
+        update: A `Blob`, indicate the update content.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+        name: A name for the operation (optional).
+
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow 
+        import oneflow.compatible.single_client.typing as tp 
+        import numpy as np 
+
+
+        @flow.global_function()
+        def slice_update_job(x: tp.Numpy.Placeholder(shape=(5, )), 
+                            update: tp.Numpy.Placeholder(shape=(3, )))->tp.Numpy: 
+            out = flow.slice_update(x=x, 
+                                    update=update, 
+                                    slice_tup_list=[[1, 4, 1]])
+
+            return out 
+
+        x = np.array([1, 1, 1, 1, 1]).astype(np.float32)
+        update = np.array([2, 3, 4]).astype(np.float32)
+        out = slice_update_job(x, update)
+
+        # out [1. 2. 3. 4. 1.]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("SliceUpdate_")
+    if not isinstance(name, str):
+        raise ValueError("name must be a string")
+    (start, stop, step) = check_slice_tup_list(slice_tup_list, x.shape)
+    op = (
+        flow.user_op_builder(name)
+        .Op("slice_update")
+        .Input("x", [x])
+        .Input("update", [update])
+        .Output("y")
+        .Attr("start", start)
+        .Attr("stop", stop)
+        .Attr("step", step)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def GetSliceAttrs(slice_tup_list, input_shape):
+    ndim = len(input_shape)
+    if not (isinstance(slice_tup_list, (list, tuple)) and len(slice_tup_list) <= ndim):
+        raise ValueError(
+            "slice_tup_list must be a list or tuple with length less than or equal to number of dimensions of input tensor"
+        )
+    if len(slice_tup_list) < ndim:
+        slice_tup_list += type(slice_tup_list)(
+            [(None, None, None)] * (ndim - len(slice_tup_list))
+        )
+    start_list = []
+    stop_list = []
+    step_list = []
+    for (slice_tup, dim_size) in zip(slice_tup_list, input_shape):
+        if not (isinstance(slice_tup, (tuple, list)) and len(slice_tup) == 3):
+            raise ValueError(
+                "element of slice_tup_list must be a list or tuple with form (start, stop, step)"
+            )
+        if not all((isinstance(idx, int) or idx is None for idx in slice_tup)):
+            raise ValueError("element of slice tuple must int or None")
+        (start, stop, step) = slice_tup
+        if step is None:
+            step = 1
+        if step <= 0:
+            raise ValueError("slice_assign/logical_slice step must be greater than 0")
+        if start is None:
+            start = 0
+        elif start < -dim_size or start >= dim_size:
+            raise ValueError(
+                "slice_assign/logical_slice start must be in range [-size, size)"
+            )
+        elif start < 0:
+            start += dim_size
+        if stop is None:
+            stop = dim_size
+        elif stop < -dim_size or stop > dim_size:
+            raise ValueError(
+                "slice_assign/logical_slice start must be in range [-size, size]"
+            )
+        elif stop < 0:
+            stop += dim_size
+        start_list.append(start)
+        stop_list.append(stop)
+        step_list.append(step)
+    return (start_list, stop_list, step_list)
+
+
+def logical_slice(
+    x: oneflow._oneflow_internal.BlobDesc,
+    slice_tup_list: Sequence[Tuple[int, int, int]],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    name = id_util.UniqueStr("LogicalSlice_") if name is None else name
+    if not isinstance(name, str):
+        raise ValueError("name must be a string")
+    (start_list, stop_list, step_list) = GetSliceAttrs(slice_tup_list, x.shape)
+    op = (
+        flow.user_op_builder(name)
+        .Op("logical_slice")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("start", start_list)
+        .Attr("stop", stop_list)
+        .Attr("step", step_list)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def logical_slice_assign(
+    x: oneflow._oneflow_internal.BlobDesc,
+    value: oneflow._oneflow_internal.BlobDesc,
+    slice_tup_list: Sequence[Tuple[int, int, int]],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    name = id_util.UniqueStr("LogicalSliceAssign_") if name is None else name
+    if not isinstance(name, str):
+        raise ValueError("name must be a string")
+    (start_list, stop_list, step_list) = GetSliceAttrs(slice_tup_list, x.shape)
+    op = (
+        flow.user_op_builder(name)
+        .Op("logical_slice_assign")
+        .Input("ref", [x])
+        .Input("value", [value])
+        .Attr("start", start_list)
+        .Attr("stop", stop_list)
+        .Attr("step", step_list)
+        .Build()
+    )
+    return op.InferAndTryRun()
+
+
+def reverse(
+    input: oneflow._oneflow_internal.BlobDesc,
+    axis: Union[int, Sequence[int]],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator reverses the elements on the assigned axis.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        axis (Union[int, Sequence[int]]): The reverse axis.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        ValueError: The name must be a string.
+        ValueError: The axis must be a int or a list/tuple of int.
+        ValueError: The axis is out of range.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reverse_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32)) -> tp.Numpy:
+            reverse_blob = flow.reverse(x,
+                                        axis=0)
+            return reverse_blob
+
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]]).astype(np.float32)
+        out = reverse_Job(x)
+
+        # out [[7. 8. 9.]
+        #      [4. 5. 6.]
+        #      [1. 2. 3.]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Reverse_")
+    if not isinstance(name, str):
+        raise ValueError("name must be a string")
+    if isinstance(axis, int):
+        axis = [axis]
+    if not isinstance(axis, (tuple, list)) or not all(
+        (isinstance(a, int) for a in axis)
+    ):
+        raise ValueError("axis must be a int or a list/tuple of int")
+    ndim = len(input.shape)
+    slice_tup_list = [(None, None, None)] * ndim
+    for (i, a) in enumerate(axis):
+        if a < 0:
+            a += ndim
+        if a < 0 or a >= ndim:
+            raise ValueError("axis is out of range")
+        slice_tup_list[a] = (None, None, -1)
+    return slice_v2(input, slice_tup_list, name)
+
+
+def concat(
+    inputs: Optional[Sequence[oneflow._oneflow_internal.BlobDesc]] = None,
+    axis: int = 0,
+    max_dim_size: Optional[int] = None,
+    name: Optional[str] = None,
+    values: Optional[Sequence[oneflow._oneflow_internal.BlobDesc]] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Concatenate two or more `Blob` s at specified axis.
+
+    Analogous to `numpy.concatenate <https://docs.scipy.org/doc/numpy/reference/generated/numpy.concatenate.html>`_
+
+    Args:
+        inputs: a `list` of `Blob`
+        axis: a `int`. `0` by default
+        max_dim_size: hint of max dimension size along the given axis
+        name: name of this operator. `None` by default
+        values: deprecated param, use inputs instead
+
+    Returns:
+        A `Blob`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def concat_Job() -> tp.Numpy:
+            constant_blob_1 = flow.constant(value=1.5,
+                                            shape=(1, 3, 3, 4),
+                                            dtype=flow.float,
+                                            name="blob1")
+            constant_blob_2 = flow.constant(value=2.5,
+                                            shape=(1, 3, 3, 4),
+                                            dtype=flow.float,
+                                            name="blob2")
+            return flow.concat(inputs=[constant_blob_1, constant_blob_2],
+                            axis=3)
+
+
+        out = concat_Job()
+
+        # out.shape (1, 3, 3, 8)
+
+    """
+    if values is not None:
+        assert inputs is None
+        inputs = values
+    assert isinstance(inputs, (list, tuple))
+    if len(inputs) == 1:
+        return inputs[0]
+    assert len(inputs) >= 2
+    if axis < 0:
+        axis += len(inputs[0].shape)
+    assert axis >= 0 and axis < len(
+        inputs[0].shape
+    ), "axis must be in range [0, num_axes of inputs)"
+    first_input_shape = inputs[0].shape
+    static_dim_size = 0
+    dynamic_dim_size = 0
+    for input in inputs:
+        assert len(input.shape) == len(first_input_shape)
+        for i in range(len(input.shape)):
+            if i == axis:
+                if input.is_dynamic:
+                    dynamic_dim_size += input.shape[i]
+                else:
+                    static_dim_size += input.shape[i]
+            else:
+                assert input.shape[i] == first_input_shape[i]
+    if max_dim_size is None:
+        max_dim_size = static_dim_size + dynamic_dim_size
+    else:
+        assert (
+            max_dim_size >= static_dim_size
+        ), "max diemension size {} is too small to hold concatenated static dimension size {} along the given axis".format(
+            max_dim_size, static_dim_size
+        )
+    if name is None:
+        name = id_util.UniqueStr("Concat_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("concat")
+        .Input("in", inputs)
+        .Output("out")
+        .Attr("axis", axis)
+        .Attr("max_dim_size", max_dim_size)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def gather_nd(
+    params: oneflow._oneflow_internal.BlobDesc,
+    indices: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator is a high-dimensional extension of `gather`, `indices` is a K-dimensional
+    tensor, which is regarded as a index of input Blob `params`.
+
+    Each element defines a slice of `params`:
+
+    .. math::
+
+        output[(i_0,i_1,...,i_{K-2})] = param[indices(i_{0},i_{1},...,i_{K-2})]
+
+
+    Args:
+        params (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        indices (oneflow._oneflow_internal.BlobDesc): The slice indices.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def gather_nd_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+                        indice: tp.Numpy.Placeholder(shape=(2, 1), dtype=flow.int32)
+        ) -> tp.Numpy:
+            gather_nd_blob = flow.gather_nd(params=x,
+                                            indices=indice)
+            return gather_nd_blob
+
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]]).astype(np.float32)
+        indice = np.array([[0], [2]]).astype(np.int32)
+        out = gather_nd_Job(x, indice)
+
+        # out [[1. 2. 3.]
+        #      [7. 8. 9.]]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def gather_nd_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+                        indice: tp.Numpy.Placeholder(shape=(2, 2), dtype=flow.int32)
+        ) -> tp.Numpy:
+            gather_nd_blob = flow.gather_nd(params=x,
+                                            indices=indice)
+            return gather_nd_blob
+
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]]).astype(np.float32)
+        indice = np.array([[0, 2], [2, 1]]).astype(np.int32)
+        out = gather_nd_Job(x, indice)
+
+        # out [3. 8.]
+
+    Example3:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def gather_nd_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+                        indice: tp.Numpy.Placeholder(shape=(3, 2), dtype=flow.int32)
+        ) -> tp.Numpy:
+            gather_nd_blob = flow.gather_nd(params=x,
+                                            indices=indice)
+            return gather_nd_blob
+
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]]).astype(np.float32)
+        indice = np.array([[0, 1], [1, 0], [2, 2]]).astype(np.int32)
+        out = gather_nd_Job(x, indice)
+
+        # out [2. 4. 9.]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("GatherNd_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("gather_nd")
+        .Input("params", [params])
+        .Input("indices", [indices])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()[0]
+
+
+def scatter_nd(
+    indices: oneflow._oneflow_internal.BlobDesc,
+    updates: oneflow._oneflow_internal.BlobDesc,
+    shape: Sequence[int],
+    name: Optional[str] = None,
+):
+    """This operator inserts the elements in `updates` according to the `indices` and create a new Blob.
+
+    Args:
+        indices (oneflow._oneflow_internal.BlobDesc): The indice of `updates`. Its type should be `flow.int`.
+        updates (oneflow._oneflow_internal.BlobDesc): The update Blob.
+        shape (Sequence[int]): The constant tensor shape, the constant tensor elements are all zero.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def scatter_nd_Job(indice: tp.Numpy.Placeholder(shape=(3, 1), dtype=flow.int32),
+                        update: tp.Numpy.Placeholder(shape=(3, ), dtype=flow.float32),
+        ) -> tp.Numpy:
+            scatter_blob = flow.scatter_nd(indices=indice,
+                                        updates=update,
+                                        shape=[8])
+            return scatter_blob
+
+
+        indice_array = np.array([[1], [6], [4]]).astype(np.int32)
+        update_array = np.array([10.2, 5.1, 12.7]).astype(np.float32)
+        out = scatter_nd_Job(indice_array, update_array)
+
+        # [ 0.  10.2  0.   0.  12.7  0.   5.1  0. ]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def scatter_nd_Job(indice: tp.Numpy.Placeholder(shape=(3, 1), dtype=flow.int32),
+                        update: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+        ) -> tp.Numpy:
+            scatter_blob = flow.scatter_nd(indices=indice,
+                                        updates=update,
+                                        shape=[5, 3])
+            return scatter_blob
+
+
+        indice_array = np.array([[0], [4], [2]]).astype(np.int32)
+        update_array = np.array([[1, 1, 1],
+                                [2, 2, 2],
+                                [3, 3, 3]]).astype(np.float32)
+        out = scatter_nd_Job(indice_array, update_array)
+
+        # out [[1. 1. 1.]
+        #      [0. 0. 0.]
+        #      [3. 3. 3.]
+        #      [0. 0. 0.]
+        #      [2. 2. 2.]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ScatterNd_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("scatter_nd")
+        .Input("indices", [indices])
+        .Input("updates", [updates])
+        .Attr("shape", shape)
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()[0]
+
+
+def tensor_scatter_nd_update(
+    params: oneflow._oneflow_internal.BlobDesc,
+    indices: oneflow._oneflow_internal.BlobDesc,
+    updates: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator inserts the elements in `updates` according to the `indices` into the Blob `params`.
+
+    Args:
+        params (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        indices (oneflow._oneflow_internal.BlobDesc): The indice of `updates`. Its type should be `flow.int32`.
+        updates (oneflow._oneflow_internal.BlobDesc): The update Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def tensor_scatter_nd_Job(x: tp.Numpy.Placeholder(shape=(5, 3), dtype=flow.float32),
+                                indice: tp.Numpy.Placeholder(shape=(3, 1), dtype=flow.int32),
+                                update: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+        ) -> tp.Numpy:
+            scatter_blob = flow.tensor_scatter_nd_update(params=x,
+                                                        indices=indice,
+                                                        updates=update)
+            return scatter_blob
+
+        x = np.array([[1, 2, 3],
+                    [1, 2, 3],
+                    [1, 2, 3],
+                    [1, 2, 3],
+                    [1, 2, 3]]).astype(np.float32)
+        indice_array = np.array([[0], [4], [2]]).astype(np.int32)
+        update_array = np.array([[1, 1, 1],
+                                [2, 2, 2],
+                                [3, 3, 3]]).astype(np.float32)
+        out = tensor_scatter_nd_Job(x, indice_array, update_array)
+
+        # out [[1. 1. 1.]
+        #      [1. 2. 3.]
+        #      [3. 3. 3.]
+        #      [1. 2. 3.]
+        #      [2. 2. 2.]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("TensorScatterNdUpdate_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("tensor_scatter_nd_update")
+        .Input("params", [params])
+        .Input("updates", [updates])
+        .Input("indices", [indices])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()[0]
+
+
+def tensor_scatter_nd_add(
+    params: oneflow._oneflow_internal.BlobDesc,
+    indices: oneflow._oneflow_internal.BlobDesc,
+    updates: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator adds elements from 'updates' to Blob 'params' based on the `indices`.
+
+    Args:
+        params (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        indices (oneflow._oneflow_internal.BlobDesc): The indice of `updates`. Its type should be `flow.int32`.
+        updates (oneflow._oneflow_internal.BlobDesc): The update Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example锛�
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def tensor_scatter_nd_add_Job(x: tp.Numpy.Placeholder(shape=(5, 3), dtype=flow.float32),
+                                    indice: tp.Numpy.Placeholder(shape=(3, 1), dtype=flow.int32),
+                                    update: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+        ) -> tp.Numpy:
+            scatter_blob = flow.tensor_scatter_nd_add(params=x,
+                                                    indices=indice,
+                                                    updates=update)
+            return scatter_blob
+
+        x = np.array([[1, 2, 3],
+                    [1, 2, 3],
+                    [1, 2, 3],
+                    [1, 2, 3],
+                    [1, 2, 3]]).astype(np.float32)
+        indice_array = np.array([[0], [4], [2]]).astype(np.int32)
+        update_array = np.array([[1, 1, 1],
+                                [2, 2, 2],
+                                [3, 3, 3]]).astype(np.float32)
+        out = tensor_scatter_nd_add_Job(x, indice_array, update_array)
+
+        # out [[2. 3. 4.]
+        #      [1. 2. 3.]
+        #      [4. 5. 6.]
+        #      [1. 2. 3.]
+        #      [3. 4. 5.]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("TensorScatterNdAdd_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("tensor_scatter_nd_add")
+        .Input("params", [params])
+        .Input("updates", [updates])
+        .Input("indices", [indices])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()[0]
+
+
+def argwhere(
+    condition: oneflow._oneflow_internal.BlobDesc,
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator finds the indices of input Blob `condition` elements that are non-zero. It returns a List.
+    Each element in the output is a coordinate that points to a non-zero element in the condition.
+
+    Args:
+        condition (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        dtype (Optional[flow.dtype], optional): The data type of output. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob. Its type is `ListNumpy`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def argwhere_Job(x: tp.Numpy.Placeholder(shape=(2, 3), dtype=flow.float32),
+        ) -> tp.ListNumpy:
+            return flow.argwhere(x)
+
+
+        x = np.array([[0, 1, 0],
+                    [2, 0, 2]]).astype(np.float32)
+        out = argwhere_Job(x)
+
+        # out [array([[0, 1],
+        #             [1, 0],
+        #             [1, 2]], dtype=int32)]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ArgWhere_")
+    if dtype is None:
+        dtype = flow.int32
+    op = (
+        flow.user_op_builder(name)
+        .Op("argwhere")
+        .Input("input", [condition])
+        .Attr("dtype", dtype)
+        .Output("output")
+        .Output("output_size")
+        .Build()
+    )
+    (output, output_size) = op.InferAndTryRun().RemoteBlobList()
+    return sync_dynamic_resize(output, output_size)
+
+
+def nonzero(
+    a: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator finds the indices of input Blob `condition` elements that are non-zero.
+
+    Args:
+        a (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    if name is None:
+        argwhere_name = id_util.UniqueStr("Nonzero_ArgWhere_")
+        tranpose_name = id_util.UniqueStr("Nonzero_Transpose_")
+    else:
+        argwhere_name = name + "_ArgWhere"
+        tranpose_name = name + "_Transpose"
+    indices = argwhere(a, name=argwhere_name)
+    return transpose(indices, perm=(1, 0), name=tranpose_name)
+
+
+def where(
+    condition: oneflow._oneflow_internal.BlobDesc,
+    x: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    y: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns the elements where condition is larger than 0.
+
+    If `x` and `y` is None, this operator is equal to `oneflow.compatible.single_client.argwhere`.
+
+    If `x` and `y` both are not None, If the element in condition is larger than 0,
+    it will take the `x` element, else it will take the `y` element.
+
+    Args:
+        condition (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        x (Optional[oneflow._oneflow_internal.BlobDesc], optional): A Blob. Defaults to None.
+        y (Optional[oneflow._oneflow_internal.BlobDesc], optional): A Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        ValueError: It is not supported when exactly one of x or y is non-None
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob. Its type is `ListNumpy`.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def where_Job(condition: tp.Numpy.Placeholder(shape=(5, ), dtype=flow.int32),
+                    x: tp.Numpy.Placeholder(shape=(5, ), dtype=flow.float32),
+                    y: tp.Numpy.Placeholder(shape=(5, ), dtype=flow.float32),
+        ) -> tp.ListNumpy:
+            return flow.where(condition=condition,
+                            x=x,
+                            y=y)
+
+
+        condition = np.array([3, 0, 1, 0, 1]).astype(np.int32)
+        x = np.array([10, 20, 30, 40, 50]).astype(np.float32)
+        y = np.array([100, 200, 300, 400, 500]).astype(np.float32)
+        out = where_Job(condition, x, y)
+
+        # out [array([ 10., 200.,  30., 400.,  50.], dtype=float32)]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def where_Job(condition: tp.Numpy.Placeholder(shape=(5, ), dtype=flow.int32),
+        ) -> tp.ListNumpy:
+            return flow.where(condition=condition)
+
+
+        condition = np.array([3, 0, 1, 0, 1]).astype(np.int32)
+        out = where_Job(condition)
+
+        # out [array([[0],
+        #             [2],
+        #             [4]], dtype=int32)]
+
+    """
+    if x is None and y is None:
+        return argwhere(condition, name=name)
+    elif x is not None and y is not None:
+        if name is None:
+            name = id_util.UniqueStr("Where_")
+        if x.shape == condition.shape and y.shape == condition.shape:
+            broadcast_cond = condition
+            broadcast_x = x
+            broadcast_y = y
+        else:
+            broadcast_cond = flow.broadcast_to_compatible_with(condition, [x, y])
+            broadcast_x = flow.broadcast_to_compatible_with(x, [condition, y])
+            broadcast_y = flow.broadcast_to_compatible_with(y, [condition, x])
+        return (
+            flow.user_op_builder(name)
+            .Op("where")
+            .Input("condition", [broadcast_cond])
+            .Input("x", [broadcast_x])
+            .Input("y", [broadcast_y])
+            .Output("out")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        raise ValueError("it is not supported when exactly one of x or y is non-None")
+
+
+def elem_cnt(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns the amount of elements in input Blob.
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        dtype (Optional[flow.dtype], optional): The data type. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob. Its type is `ListNumpy`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def elem_cnt_Job(x: tp.Numpy.Placeholder(shape=(5, ), dtype=flow.float32),
+        ) -> tp.ListNumpy:
+            return flow.elem_cnt(inputs=x, dtype=flow.int32)
+
+        x = np.array([10, 20, -30, 40, 50]).astype(np.float32)
+        out = elem_cnt_Job(x)
+
+        # [array([5], dtype=int32)]
+
+    """
+    op_conf = op_conf_util.OperatorConf()
+    setattr(
+        op_conf, "name", name if name is not None else id_util.UniqueStr("ElemCnt_")
+    )
+    op_conf.shape_elem_cnt_conf.x = inputs.unique_name
+    op_conf.shape_elem_cnt_conf.exclude_axis_conf.SetInParent()
+    if dtype is not None:
+        op_conf.shape_elem_cnt_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            dtype
+        )
+    op_conf.shape_elem_cnt_conf.y = "y"
+    interpret_util.Forward(op_conf)
+    out_lbi = logical_blob_id_util.LogicalBlobId()
+    setattr(out_lbi, "op_name", op_conf.name)
+    setattr(out_lbi, "blob_name", "y")
+    return remote_blob_util.RemoteBlob(out_lbi)
+
+
+def sync_dynamic_resize(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    size: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        size (oneflow._oneflow_internal.BlobDesc): The size of new Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob. Its type is `ListNumpy`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sync_dynamic_resize_Job(x: tp.Numpy.Placeholder(shape=(4, 3), dtype=flow.float32),
+                                    size: tp.Numpy.Placeholder(shape=(1, ), dtype=flow.int32),
+        ) -> tp.ListNumpy:
+            resize_Blob = flow.sync_dynamic_resize(inputs=x,
+                                                size=size)
+            return resize_Blob
+
+        x = np.array([[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9],
+                    [10, 11, 12]]).astype(np.float32)
+        size = np.array([2]).astype(np.int32)
+        out = sync_dynamic_resize_Job(x, size)
+
+        # out [array([[1., 2., 3.],
+        #             [4., 5., 6.]], dtype=float32)]
+
+    """
+    op_conf = op_conf_util.OperatorConf()
+    setattr(
+        op_conf,
+        "name",
+        name if name is not None else id_util.UniqueStr("SyncDynamicResize_"),
+    )
+    setattr(op_conf.sync_dynamic_resize_conf, "in", inputs.unique_name)
+    setattr(op_conf.sync_dynamic_resize_conf, "size", size.unique_name)
+    setattr(op_conf.sync_dynamic_resize_conf, "axis", 0)
+    setattr(op_conf.sync_dynamic_resize_conf, "out", "out")
+    setattr(op_conf.sync_dynamic_resize_conf, "eager", flow.eager_execution_enabled())
+    interpret_util.Forward(op_conf)
+    out_lbi = logical_blob_id_util.LogicalBlobId()
+    setattr(out_lbi, "op_name", op_conf.name)
+    setattr(out_lbi, "blob_name", "out")
+    return remote_blob_util.RemoteBlob(out_lbi)
+
+
+def stack(
+    inputs: Sequence[oneflow._oneflow_internal.BlobDesc],
+    axis: int = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator stacks the multiple Blobs on the specified axis.
+
+    Args:
+        inputs (Sequence[oneflow._oneflow_internal.BlobDesc]): A list of input Blob.
+        axis (int): The stack axis.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def stack_job(x: tp.Numpy.Placeholder(shape=(2, 4, 6)),
+                    y: tp.Numpy.Placeholder(shape=(2, 4, 6)))->tp.Numpy:
+            out = flow.stack([x, y], axis=2)
+            return out
+
+        x = np.ones(shape=(2, 4, 6), dtype=np.float32)
+        y = np.ones(shape=(2, 4, 6), dtype=np.float32)
+
+        out = stack_job(x, y)
+
+        # output.shape (2, 4, 2, 6)
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Stack_")
+    inputs = list(inputs)
+    _input_shape = inputs[0].shape
+    _max_dim = len(_input_shape)
+    if axis < 0:
+        axis = axis + _max_dim + 1
+    assert axis >= 0 and axis <= _max_dim
+    _input_list_length = len(inputs)
+    for i in range(_input_list_length):
+        _current_shape = inputs[i].shape
+        assert (
+            _input_shape == _current_shape
+        ), "Each tensor should have the same shape ! Found a tensor instance shape is: {}".format(
+            _current_shape
+        )
+        inputs[i] = flow.expand_dims(
+            inputs[i], axis=axis, name=name + "expand_dims_{}".format(i)
+        )
+    return flow.concat(inputs, axis=axis, name=name + "concat")
+
+
+def generate_random_batch_permutation_indices(
+    value: oneflow._oneflow_internal.BlobDesc,
+    seed: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator generates a random permutation of indices in batch axis.
+
+    Args:
+        value (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        seed (Optional[int], optional): The random seed. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob. Its type is `ListNumpy`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def random_indice_Job(x: tp.Numpy.Placeholder(shape=(4, 3), dtype=flow.int32),
+        ) -> tp.ListNumpy:
+            return flow.random.generate_random_batch_permutation_indices(value=x)
+
+        x = np.array([[1, 1, 1],
+                    [2, 2, 2],
+                    [3, 3, 3],
+                    [4, 4, 4]]).astype(np.int32)
+        out = random_indice_Job(x)
+
+        # out [array([3, 0, 2, 1], dtype=int32)]
+
+    """
+    import random
+
+    op = (
+        flow.user_op_builder(
+            name
+            if name is not None
+            else id_util.UniqueStr(value.op_name + "_random_batch_permutation_indices")
+        )
+        .Op("generate_random_batch_permutation_indices")
+        .Input("x", [value])
+        .Output("y")
+    )
+    if seed is not None:
+        op.Attr("seed", seed)
+        assert name is not None
+    else:
+        op.Attr("seed", random.randint(-(2 ** 63) + 1, 2 ** 63 - 1))
+    return op.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def shuffle(
+    value: oneflow._oneflow_internal.BlobDesc,
+    seed: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator shuffle the elements in input Blob.
+
+    Args:
+        value (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        seed (Optional[int], optional): The random seed. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def shuffle_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.int32),
+        ) -> tp.Numpy:
+            return flow.random.shuffle(x)
+
+        x = np.array([[1, 1, 1],
+                    [2, 2, 2],
+                    [3, 3, 3]]).astype(np.int32)
+        out = shuffle_Job(x)
+
+        # out [[3 3 3]
+        #      [1 1 1]
+        #      [2 2 2]]
+
+    """
+    return flow.gather(value, generate_random_batch_permutation_indices(value, seed))
+
+
+def identity(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns a `Blob` that has identical content and data type to input `Blob`.
+
+    Analogous to `tf.identity <https://www.tensorflow.org/api_docs/python/tf/identity>`_
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def identity_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.int32),
+        ) -> tp.Numpy:
+            return flow.identity(x)
+
+        x = np.array([[1, 1, 1],
+                    [2, 2, 2],
+                    [3, 3, 3]]).astype(np.int32)
+        out = identity_Job(x)
+
+        # out [[1 1 1]
+        #      [2 2 2]
+        #      [3 3 3]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Identity_")
+    op = (
+        flow.user_op_builder(name).Op("identity").Input("in", [x]).Output("out").Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def identity_n(
+    inputs: Sequence[oneflow._oneflow_internal.BlobDesc], name: Optional[str] = None
+) -> List[oneflow._oneflow_internal.BlobDesc]:
+    """This operator is similar to `oneflow.compatible.single_client.identity`. The difference is that the input and output
+    of `identity_n` is `List`.
+
+    Args:
+        inputs (Iterable[oneflow._oneflow_internal.BlobDesc]): A List of input Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        List[oneflow._oneflow_internal.BlobDesc]: A list of result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+        from typing import List
+
+
+        @flow.global_function()
+        def identity_Job(x: tp.Numpy.Placeholder(shape=(1, 3), dtype=flow.int32),
+                        y: tp.Numpy.Placeholder(shape=(1, 3), dtype=flow.int32),
+                        z: tp.Numpy.Placeholder(shape=(1, 3), dtype=flow.int32)
+        ) -> List[tp.Numpy]:
+            return flow.identity_n([x, y, z])
+
+
+        x = np.array([[1, 1, 1]]).astype(np.int32)
+        y = np.array([[2, 2, 2]]).astype(np.int32)
+        z = np.array([[3, 3, 3]]).astype(np.int32)
+        out = identity_Job(x, y, z)
+
+        # out[0] [[1, 1, 1]]
+        # out[1] [[2, 2, 2]]
+        # out[2] [[3, 3, 3]]
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("IdentityN_")
+        )
+        .Op("tuple_identity")
+        .Input("in", inputs)
+        .Output("out", len(inputs))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+
+
+def cast_to_static_shape(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns a `Blob` that has identical content and data type to input `Blob`, and whose shape is converted from dynamic to static
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob which has dynamic shape.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob which is identical to input blob but has static shape.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def cast_to_static_shape_func(
+            x: tp.ListNumpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+        ) -> tp.Numpy:
+            return flow.cast_to_static_shape(x)
+
+        x = np.array([[1, 1, 1],
+                      [2, 2, 2],
+                      [3, 3, 3]]).astype(np.float32)
+
+        out = cast_to_static_shape_func(x)
+
+        # out [[1 1 1]
+        #      [2 2 2]
+        #      [3 3 3]]
+
+    """
+    if not x.is_dynamic:
+        return x
+    if name is None:
+        name = id_util.UniqueStr("CastToStaticShape_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("cast_to_static_shape")
+        .Input("input", [x])
+        .Output("output")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def squeeze(
+    input: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Sequence[int]] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator removes the specified dimention which size is 1 of the input Blob.
+    If the `axis` is not specified, this operator will remove all the dimention which size is 1 of the input Blob.
+
+    The amount of element in return value is the same as Blob `input`.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        axis (Optional[Sequence[int]], optional): The axis. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def squeeze_Job(x: tp.Numpy.Placeholder(shape=(1, 1, 1, 3), dtype=flow.int32),
+        ) -> tp.Numpy:
+            return flow.squeeze(x)
+
+
+        x = np.array([[[[1, 1, 1]]]]).astype(np.int32)
+        out = squeeze_Job(x)
+
+        # out.shape (3,)
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def squeeze_Job(x: tp.Numpy.Placeholder(shape=(1, 1, 1, 3), dtype=flow.int32),
+        ) -> tp.Numpy:
+            return flow.squeeze(x, axis=[1, 2])
+
+
+        x = np.array([[[[1, 1, 1]]]]).astype(np.int32)
+        out = squeeze_Job(x)
+
+        # out.shape (1, 3)
+
+    """
+    if axis is None:
+        axis = [idx for (idx, dim) in enumerate(input.shape) if dim == 1]
+    else:
+        assert isinstance(axis, list) or isinstance(axis, tuple)
+        in_num_axes = len(input.shape)
+        for x in axis:
+            assert x >= -in_num_axes and x < in_num_axes
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Squeeze_")
+        )
+        .Op("squeeze")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("axes", list(axis))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def expand(
+    x: oneflow._oneflow_internal.BlobDesc,
+    expand_size: Sequence[int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator expand the input Blob to a larger size.
+
+    Passing -1 as the size for a dimension means not changing the size of that dimension.
+
+    Blob can be also expanded to a larger number of dimensions and the new ones will be appended at the front.
+
+    For the new dimensions, the size cannot be set to -1.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        expand_size (Sequence[int]): The desired expanded size.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def expandJob(x: tp.Numpy.Placeholder(shape=(1, 3, 1, 2), dtype=flow.int32),
+        ) -> tp.Numpy:
+            return flow.expand(input=x,
+                                expand_size=[1, 3, 2, 2])
+
+        x = np.array([[[[0, 1]],
+                       [[2, 3]],
+                       [[4, 5]]]]).astype(np.int32)
+
+        out = expandJob(x)
+        # out shape: [1, 3, 2, 2]
+        # [[[[0, 1],
+        #    [0, 1]],
+        #   [[2, 3],
+        #    [2, 3]],
+        #   [[4, 5],
+        #    [4, 5]]]]
+    """
+    expand_size = list(expand_size)
+    assert len(expand_size) >= len(
+        x.shape
+    ), "The desired expanded dims should not be less than the input dims."
+    original_stride = [1]
+    for i in range(len(x.shape) - 2, -1, -1):
+        original_stride.insert(0, original_stride[0] * x.shape[i + 1])
+    new_size = []
+    new_stride = []
+    diff = len(expand_size) - len(x.shape)
+    for i in range(len(expand_size) - 1, -1, -1):
+        if i >= diff:
+            if expand_size[i] == -1 or expand_size[i] == x.shape[i - diff]:
+                new_size.insert(0, x.shape[i - diff])
+                new_stride.insert(0, original_stride[i - diff])
+            else:
+                assert expand_size[i] >= 1 and x.shape[i - diff] == 1
+                new_size.insert(0, expand_size[i])
+                new_stride.insert(0, 0)
+        else:
+            assert expand_size[i] >= 1
+            new_size.insert(0, expand_size[i])
+            if expand_size[i] == 1:
+                new_stride.insert(0, new_stride[0])
+            else:
+                new_stride.insert(0, 0)
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Expand_"))
+        .Op("expand")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("in_shape", list(x.shape))
+        .Attr("out_shape", new_size)
+        .Attr("stride", new_stride)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def expand_dims(
+    input: oneflow._oneflow_internal.BlobDesc, axis: int, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator inserts a dimention at the specified axis in the input Blob.
+    The size of new dimension can only be 1, and the amount of element in return value is the same as Blob `input`.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        axis (int): The specified dimension index.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def expand_dim_Job(x: tp.Numpy.Placeholder(shape=(1, 3, 3), dtype=flow.int32),
+        ) -> tp.Numpy:
+            return flow.expand_dims(input=x,
+                                    axis=2)
+
+
+        x = np.array([[[1, 1, 1],
+                    [1, 1, 1],
+                    [1, 1, 1]]]).astype(np.int32)
+        out = expand_dim_Job(x)
+
+        # out.shape (1, 3, 1, 3)
+
+    """
+    in_num_axes = len(input.shape)
+    assert axis >= -(in_num_axes + 1) and axis <= in_num_axes
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("ExpandDims_")
+        )
+        .Op("expand_dims")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("axis", axis)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def broadcast_like(
+    x: oneflow._oneflow_internal.BlobDesc,
+    like: oneflow._oneflow_internal.BlobDesc,
+    broadcast_axes: Optional[Sequence[int]] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator broadcast the input Blob `x` on the specified axis with input Blob `like`.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        like (oneflow._oneflow_internal.BlobDesc): A Blob.
+        broadcast_axes (Optional[Sequence[int]], optional): The broadcast axis. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        ValueError: The length of broadcast_axes must be greater than 0 and less than or equal to number of axes of like shape.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def broadcast_like_Job(x: tp.Numpy.Placeholder(shape=(3, 1), dtype=flow.float32)
+        ) -> tp.Numpy:
+            like_tensor = flow.constant(value=1.0,
+                                        dtype=flow.float32,
+                                        shape=(3, 3))
+            return flow.broadcast_like(x=x,
+                                    like=like_tensor,
+                                    broadcast_axes=(1, ))
+
+
+        x = np.array([[1], [1], [1]]).astype(np.float32)
+        out = broadcast_like_Job(x)
+
+        # out [[[1 1 1]
+        #       [1 1 1]
+        #       [1 1 1]]]
+
+        # out.shape (3, 3)
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def broadcast_like_Job(x: tp.Numpy.Placeholder(shape=(3, 1, 1), dtype=flow.float32)
+        ) -> tp.Numpy:
+            like_tensor = flow.constant(value=1.0,
+                                        dtype=flow.float32,
+                                        shape=(3, 3, 3))
+            return flow.broadcast_like(x=x,
+                                    like=like_tensor,
+                                    broadcast_axes=(1, 2))
+
+
+        x = np.random.randn(3, 1, 1).astype(np.float32)
+        out = broadcast_like_Job(x)
+
+        # out.shape (3, 3, 3)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("BroadcastLike_")
+    if broadcast_axes is None:
+        broadcast_axes = list(range(len(like.shape)))
+    assert isinstance(broadcast_axes, (list, tuple))
+    if len(broadcast_axes) <= 0 or len(broadcast_axes) > len(like.shape):
+        raise ValueError(
+            "The length of broadcast_axes must be greater than 0 and less than or equal to number of axes of like shape"
+        )
+    op = (
+        flow.user_op_builder(name)
+        .Op("broadcast_like")
+        .Input("x", [x])
+        .Input("like", [like])
+        .Attr("broadcast_axes", broadcast_axes)
+        .Output("y")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def masked_fill(
+    x: oneflow._oneflow_internal.BlobDesc,
+    mask: oneflow._oneflow_internal.BlobDesc,
+    value: Union[float, int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Fill a blob with a given value according to the given mask.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input Blob.
+        mask (oneflow._oneflow_internal.BlobDesc): Composed with 0 and 1, the input blob 'x' will be
+            filled with the given value where the mask is 1.
+        value (Union[int, int]): The value to use for filling the input blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+    Attention:
+        x and mask must be broadcastable to each other.
+        mask must be int type (int8/int32/int64).
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The value-filled Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def masked_fill_Job(x: tp.Numpy.Placeholder((4, ), mask: tp.Numpy.Placeholder((4, ),
+                            dtype = flow.int8))->tp.Numpy:
+            return flow.masked_fill(x, mask, value=5)
+
+        x = np.array([1, 2, 3, 4], dtype=np.float32)
+        mask = np.array([1, 0, 0, 1], dtype=np.int8)
+
+        out = masked_fill_Job(x, mask)
+
+        # output [5 2 3 5]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("MaskedFill_")
+    value_like_x = flow.constant_like(like=x, value=value, name=name + "_ConstantLike")
+    return flow.where(condition=mask, x=value_like_x, y=x, name=name + "_Where")
+
+
+def dim_gather(
+    input: oneflow._oneflow_internal.BlobDesc,
+    dim: int,
+    index: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """ This operator gathers elements from `input` according to `index` along with the axis `dim`.
+
+    Take a 3-D blob as example, the output is specified by:
+
+    .. code-block:: python
+
+        output[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+        output[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+        output[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+
+
+    The shape of `input` and `index` should be the same except in the `dim` dimension.
+
+    That is, if `input` is a n-dimension blob with shape :math:`(x_0, x_1, \\dots, x_{i-1}, x_i, x_{i+1}, \\dots, x_n)`,
+    and `dim = i`, then `index` must be a n-dimension blob with shape :math:`(x_0, x_1, \\dots, x_{i-1}, k, x_{i+1}, \\dots, x_n)`
+    where :math:`k \\geq 1`.
+
+    The return Blob `output` will have the same shape with `index`.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input blob
+        dim (int): The axis along which to index
+        index (oneflow._oneflow_internal.BlobDesc): The index blob of elements to gather
+        name (Optional[str], optional): The name of the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The elements gathered from `input` will be returned as the output Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def dim_gather_Job(input: tp.Numpy.Placeholder((2, 2), dtype=flow.float64),
+                        index:tp.Numpy.Placeholder((2, 2), dtype=flow.int32))->tp.Numpy:
+            return flow.dim_gather(input, 1, index)
+
+        input = np.array([[1, 2], [3, 4]]).astype(np.float64)
+        index = np.array([[1, 0], [0, 1]]).astype(np.int32)
+
+        out = dim_gather_Job(input, index)
+        # output
+        # [[2. 1.]
+        #  [3. 4.]]
+
+    """
+    if len(input.shape) != len(index.shape):
+        raise ValueError("Dimensions of input and index should equal")
+    for i in range(0, len(input.shape)):
+        if dim == i:
+            continue
+        elif input.shape[i] != index.shape[i]:
+            raise ValueError(
+                "Dimensions of input and index should be same except at dim"
+            )
+    if dim >= len(index.shape):
+        raise ValueError(
+            "Value of dim is out of range(dim should be less than len(index.shape))"
+        )
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("DimGather_")
+        )
+        .Op("dim_gather")
+        .Input("input", [input])
+        .Input("index", [index])
+        .Output("output")
+        .Attr("dim", int(dim))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def amp_white_identity(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = id_util.UniqueStr("AmpWhiteIdentity_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("amp_white_identity")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def zeros(
+    shape: Sequence[int], dtype: Optional[flow.dtype] = None, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a Tensor filled with the scalar value `0`.
+
+    Args:
+        shape (Sequence[int]): The shape of the Tensor.
+        dtype (Optional[flow.dtype], optional): The data type. Defaults to None.
+        name (Optional[str], optional): The name for the operator. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Tensor filled with value `0`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def zeros_job() -> tp.Numpy:
+            return flow.zeros(shape=(2, 3), dtype=flow.float32)
+
+
+        out = zeros_job()
+
+        # output: [[0. 0. 0.]
+        #          [0. 0. 0.]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Zeros_")
+    if dtype is None:
+        dtype = flow.float32
+    return flow.constant(value=0.0, shape=shape, dtype=dtype, name=name + "constant")
+
+
+def ones(
+    shape: Sequence[int], dtype: Optional[flow.dtype] = None, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a Tensor filled with the scalar value `1`.
+
+    Args:
+        shape (Sequence[int]): The shape of the Tensor.
+        dtype (Optional[flow.dtype], optional): The data type. Defaults to None.
+        name (Optional[str], optional): The name for the operator. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob filled with value `1`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def ones_job() -> tp.Numpy:
+            return flow.ones(shape=(2, 3), dtype=flow.float32)
+
+
+        out = ones_job()
+
+        # output: [[1. 1. 1.]
+        #          [1. 1. 1.]]
+    """
+    if name is None:
+        name = id_util.UniqueStr("Ones_")
+    if dtype is None:
+        dtype = flow.float32
+    return flow.constant(value=1.0, shape=shape, dtype=dtype, name=name + "constant")
+
+
+def nvtx_start(
+    x: oneflow._oneflow_internal.BlobDesc, mark_prefix: str, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = id_util.UniqueStr("NvtxStart_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("nvtx_start")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("mark_prefix", str(mark_prefix))
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def nvtx_end(
+    x: oneflow._oneflow_internal.BlobDesc, mark_prefix: str, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = id_util.UniqueStr("NvtxEnd_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("nvtx_end")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("mark_prefix", str(mark_prefix))
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
diff --git a/python/oneflow/compatible/single_client/ops/assign_op.py b/python/oneflow/compatible/single_client/ops/assign_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..de521594a6f815b9335d6e62a866fb50e148f0c4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/assign_op.py
@@ -0,0 +1,97 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import boxing_util as boxing_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import (
+    placement_context as placement_ctx,
+)
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def assign(ref, value, dtype=None, name=None):
+    if name is None:
+        name = id_util.UniqueStr("Assign_")
+    op = (
+        flow.consistent_user_op_builder(name)
+        .Op("assign")
+        .Input("ref", [ref])
+        .Input("value", [value])
+        .Build()
+    )
+    op.InferAndTryRun()
+
+
+def api_system_assign(ref, value, validate_shape=None, use_locking=None, name=None):
+    api = enable_if.unique([lazy_system_assign, eager_system_assign])
+    return api(
+        ref, value, validate_shape=validate_shape, use_locking=use_locking, name=name
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_system_assign(ref, value, validate_shape=None, use_locking=None, name=None):
+    op_conf = _SystemAssignOpConf(ref, value, name=name)
+    (
+        device_tag,
+        machine_device_ids,
+        hierarchy,
+    ) = oneflow._oneflow_internal.GetDeviceTagAndMachineDeviceIdsAndHierarchy(
+        ref.parallel_conf
+    )
+    if hierarchy is not None:
+        hierarchy = tuple(hierarchy.dim())
+    with flow.scope.placement(device_tag, machine_device_ids, hierarchy):
+        interpret_util.Forward(op_conf)
+    return ref
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_system_assign(ref, value, validate_shape=None, use_locking=None, name=None):
+    op_conf = _SystemAssignOpConf(ref, value, name=name)
+    oneflow._oneflow_internal.deprecated.LogicalRun(
+        lambda builder: boxing_util.BuildAssignInstruction(
+            builder, ref.blob_object, value.blob_object, op_conf
+        )
+    )
+    return ref
+
+
+def api_one_to_one_assign(ref, value):
+    assert hob.eager_execution_enabled(None)
+    oneflow._oneflow_internal.deprecated.LogicalRun(
+        lambda builder: builder.Build121AssignInstruction(
+            ref.blob_object, value.blob_object
+        )
+    )
+    return ref
+
+
+def _SystemAssignOpConf(ref, value, name=None):
+    if name is None:
+        name = id_util.UniqueStr("Assign_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    op_conf.assign_conf.ref = ref.unique_name
+    op_conf.assign_conf.value = value.unique_name
+    return op_conf
diff --git a/python/oneflow/compatible/single_client/ops/builtin_ops.py b/python/oneflow/compatible/single_client/ops/builtin_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..01fda38a6042dfbee3909ea94827b19bfdb39433
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/builtin_ops.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework.attr_util import (
+    convert_to_user_attr_value,
+)
+
+
+class BuiltinOp(object):
+    def __init__(self, op_type_name, op_name=None):
+        if op_name is None:
+            op_name = id_util.UniqueStr(op_type_name)
+        self._builder = oneflow._oneflow_internal.one.OpBuilder(op_type_name, op_name)
+        self._op = None
+        self._op_type_name = op_type_name
+
+    @property
+    def op(self):
+        """access the builtin op
+
+        Returns:
+            the builtin op
+        """
+        if self._op is None:
+            self._op = self._builder.build()
+        return self._op
+
+    def Input(self, input_name, num=1):
+        """Set input blob of op
+
+        Args:
+            input_name (str): input name of blob
+            num (int, optional) : Defaults to 1.
+
+        Returns:
+            self
+        """
+        assert isinstance(num, int) and num >= 1
+        self._builder.input(input_name, num)
+        return self
+
+    def Output(self, output_name, num=1):
+        """Set output blob of op
+
+        Args:
+            output_name (str): name of output blob
+            num (int, optional):  Defaults to 1.
+
+        Returns:
+            self
+        """
+        assert isinstance(num, int) and num >= 1
+        self._builder.output(output_name, num)
+        return self
+
+    def Attr(self, attr_name, attr_value, attr_type_name=None):
+        """Set value of op's attribute.
+
+        Args:
+            attr_name (str): attribute name of op
+            attr_value (Any): attribute value of op
+
+        Raises:
+            ValueError: raised when value is not idential to op's attribute type.
+
+        Returns:
+            [type]: [description]
+        """
+        if attr_type_name is not None:
+            print(
+                'WARNING: Argument \'attr_type_name\' of UserOpConfBuilder.Attr has been deprecated. Please remove it.\n\n            For instance:\n                -     .Attr("out_num", out_num, "AttrTypeInt64")\n                +     .Attr("out_num", out_num)\n                        '
+            )
+            print(traceback.format_stack()[-2])
+        assert self._op_type_name is not None
+        self._builder.attr(
+            attr_name,
+            convert_to_user_attr_value(self._op_type_name, attr_name, attr_value),
+        )
+        return self
+
+    def Build(self):
+        """Explicitly complete the construction of the builtin op
+
+        Returns:
+            the completed builtin op
+        """
+        if self._op is None:
+            self._op = self._builder.build()
+        return self._op
diff --git a/python/oneflow/compatible/single_client/ops/categorical_ordinal_encode_op.py b/python/oneflow/compatible/single_client/ops/categorical_ordinal_encode_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eff419ee1ef9741b78991d7f7b27aa04f4efe7f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/categorical_ordinal_encode_op.py
@@ -0,0 +1,164 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def categorical_ordinal_encode(
+    table: oneflow._oneflow_internal.BlobDesc,
+    size: oneflow._oneflow_internal.BlobDesc,
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    hash_precomputed: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator maintains a hash table to encode the categorical ordinal Blob. It converts a discrete input value into a continuous integer ID.
+
+    Args:
+        table (oneflow._oneflow_internal.BlobDesc): The hash table, you can assign it as a variable.
+        size (oneflow._oneflow_internal.BlobDesc): The size of hash table.
+        input_tensor (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        hash_precomputed (bool, optional): We currently only support the 'True' mode. The internal hash value will no longer be computed. Defaults to True.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def categorical_ordinal_encode_Job(x: tp.Numpy.Placeholder((3, 3), dtype=flow.int32)
+        ) -> tp.Numpy:
+            dtype = x.dtype
+            with flow.scope.namespace("categorical_ordinal_encode"):
+                table = flow.get_variable(
+                    name="Table",
+                    shape=(16,),
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(0, dtype=dtype),
+                    trainable=False,
+                    reuse=False,
+                )
+                size = flow.get_variable(
+                    name="Size",
+                    shape=(1,),
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(0, dtype=dtype),
+                    trainable=False,
+                    reuse=False,
+                )
+                return flow.categorical_ordinal_encode(
+                    table=table, size=size, input_tensor=x, name="Encode",
+                )
+
+        x = np.array([[7, 0, 2],
+                    [1, 7, 2],
+                    [0, 1, 7]]).astype(np.int32)
+
+        out = categorical_ordinal_encode_Job(x)
+
+        # out [[1 0 2]
+        #      [3 1 2]
+        #      [0 3 1]]
+
+    """
+    assert hash_precomputed is True
+    return (
+        flow.user_op_builder(name or id_util.UniqueStr("CategoricalOrdinalEncode_"))
+        .Op("CategoricalOrdinalEncode")
+        .Input("in", [input_tensor])
+        .Input("table", [table])
+        .Input("size", [size])
+        .Output("out")
+        .Attr("hash_precomputed", hash_precomputed)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def categorical_ordinal_encoder(
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    capacity: int,
+    hash_precomputed: bool = True,
+    name: str = "CategoricalOrdinalEncoder",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator uses `oneflow.compatible.single_client.categorical_ordinal_encode` to encapsulate a categorical_ordinal_encoder. More details please refer to `oneflow.compatible.single_client.categorical_ordinal_encode`
+
+    Args:
+        input_tensor (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        capacity (int): The capacity of hash table.
+        hash_precomputed (bool, optional): We currently only support the 'True' mode. The internal hash value will no longer be computed. Defaults to True.
+        name (str, optional): The name for the operation. Defaults to "CategoricalOrdinalEncoder".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def categorical_ordinal_encoder_Job(x: tp.Numpy.Placeholder((3, 3), dtype=flow.int32)
+        ) -> tp.Numpy:
+            return flow.layers.categorical_ordinal_encoder(x, 16)
+
+        x = np.array([[7, 0, 2],
+                    [1, 7, 2],
+                    [0, 1, 7]]).astype(np.int32)
+
+        out = categorical_ordinal_encoder_Job(x)
+
+        # out [[1 0 2]
+        #      [3 1 2]
+        #      [0 3 1]]
+
+    """
+    assert hash_precomputed is True
+    dtype = input_tensor.dtype
+    with flow.scope.namespace(name):
+        table = flow.get_variable(
+            name="Table",
+            shape=(capacity * 2,),
+            dtype=dtype,
+            initializer=flow.constant_initializer(0, dtype=dtype),
+            trainable=False,
+            reuse=False,
+        )
+        size = flow.get_variable(
+            name="Size",
+            shape=(1,),
+            dtype=dtype,
+            initializer=flow.constant_initializer(0, dtype=dtype),
+            trainable=False,
+            reuse=False,
+        )
+        return categorical_ordinal_encode(
+            table=table, size=size, input_tensor=input_tensor, name="Encode"
+        )
diff --git a/python/oneflow/compatible/single_client/ops/combined_margin_loss.py b/python/oneflow/compatible/single_client/ops/combined_margin_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..49983872e26d86b57b678dfe4d0170b22139c235
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/combined_margin_loss.py
@@ -0,0 +1,58 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import module as module_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.ops import (
+    math_unary_elementwise_ops as math_unary_elementwise_ops,
+)
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def combined_margin_loss(
+    x: oneflow._oneflow_internal.BlobDesc,
+    label: oneflow._oneflow_internal.BlobDesc,
+    m1: float = 1,
+    m2: float = 0,
+    m3: float = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    depth = x.shape[1]
+    (y, theta) = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("CombinedMarginLoss_")
+        )
+        .Op("combined_margin_loss")
+        .Input("x", [x])
+        .Input("label", [label])
+        .Output("y")
+        .Output("theta")
+        .Attr("m1", float(m1))
+        .Attr("m2", float(m2))
+        .Attr("m3", float(m3))
+        .Attr("depth", int(depth))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return y
diff --git a/python/oneflow/compatible/single_client/ops/constant_op.py b/python/oneflow/compatible/single_client/ops/constant_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6d3386c41f476276eef130c2dbfb52c944e5712
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/constant_op.py
@@ -0,0 +1,303 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def constant(
+    value: Union[int, float],
+    dtype: Optional[flow.dtype] = None,
+    shape: Optional[Sequence[int]] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a constant Blob.
+
+    Args:
+        value (Union[int, float]): The constant value of Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        shape (Optional[Sequence[int]], optional): The shape of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        NotImplementedError: The data type of value should be int or float.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def constant_Job() -> tp.Numpy:
+            constant_blob = flow.constant(value=1.5,
+                                        shape=(1, 3, 3),
+                                        dtype=flow.float)
+            return constant_blob
+
+
+        out = constant_Job()
+
+        # out [[[1.5 1.5 1.5]
+        #       [1.5 1.5 1.5]
+        #       [1.5 1.5 1.5]]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Constant_")
+    assert value is not None
+    assert dtype is not None
+    if not isinstance(value, (int, float)):
+        raise NotImplementedError
+    if isinstance(value, float):
+        is_floating_value = True
+        floating_value = float(value)
+        integer_value = int(0)
+    else:
+        is_floating_value = False
+        floating_value = float(0)
+        integer_value = int(value)
+    if shape is not None:
+        assert isinstance(shape, (list, tuple))
+    else:
+        shape = []
+    return (
+        flow.user_op_builder(name)
+        .Op("constant")
+        .Output("out")
+        .Attr("floating_value", floating_value)
+        .Attr("integer_value", integer_value)
+        .Attr("is_floating_value", is_floating_value)
+        .Attr("dtype", dtype)
+        .Attr("shape", shape)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def constant_scalar(
+    value: Union[int, float],
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a constant scalar Blob.
+
+    Args:
+        value (Union[int, float]): The constant value of Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def constant_scalar_Job() -> tp.Numpy:
+            constant_scalar = flow.constant_scalar(value=2.5,
+                                                dtype=flow.float)
+            return constant_scalar
+
+
+        out = constant_scalar_Job()
+
+        # out [2.5]
+
+    """
+    return flow.constant(value, dtype=dtype, shape=[1])
+
+
+def constant_like(
+    like: oneflow._oneflow_internal.BlobDesc,
+    value: Union[int, float],
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a constant Blob that has the same shape as `like`.
+
+    Args:
+        like (oneflow._oneflow_internal.BlobDesc): A Blob.
+        value (Union[int, float]): The constant value of Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        NotImplementedError: The data type of value should be int or float.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def constant_like_Job() -> tp.Numpy:
+            constant_blob = flow.constant(value=1.5,
+                                        shape=(1, 3, 3),
+                                        dtype=flow.float)
+            constant_like_blob = flow.constant_like(like=constant_blob,
+                                                    value=5.5,
+                                                    dtype=flow.float)
+            return constant_like_blob
+
+
+        out = constant_like_Job()
+
+        # out [[[5.5 5.5 5.5]
+        #       [5.5 5.5 5.5]
+        #       [5.5 5.5 5.5]]]
+
+    """
+    op_conf = op_conf_util.OperatorConf()
+    setattr(
+        op_conf,
+        "name",
+        name if name is not None else id_util.UniqueStr("ConstantLike_"),
+    )
+    setattr(op_conf.constant_like_conf, "like", like.unique_name)
+    if isinstance(value, int):
+        op_conf.constant_like_conf.int_operand = value
+    elif isinstance(value, float):
+        op_conf.constant_like_conf.float_operand = value
+    else:
+        raise NotImplementedError
+    if dtype is not None:
+        setattr(
+            op_conf.constant_like_conf,
+            "data_type",
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(dtype),
+        )
+    setattr(op_conf.constant_like_conf, "out", "out")
+    interpret_util.Forward(op_conf)
+    out_lbi = logical_blob_id_util.LogicalBlobId()
+    setattr(out_lbi, "op_name", op_conf.name)
+    setattr(out_lbi, "blob_name", "out")
+    return remote_blob_util.RemoteBlob(out_lbi)
+
+
+def ones_like(
+    like: oneflow._oneflow_internal.BlobDesc,
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a Blob with all elements set to `1` that has the same shape as `like`.
+
+    Args:
+        like (oneflow._oneflow_internal.BlobDesc): A Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def ones_like_Job() -> tp.Numpy:
+            constant_blob = flow.constant(value=1.5,
+                                        shape=(1, 3, 3),
+                                        dtype=flow.float)
+            ones_like_blob = flow.ones_like(like=constant_blob,
+                                            dtype=flow.float)
+            return ones_like_blob
+
+
+        out = ones_like_Job()
+
+        # out [[[1. 1. 1.]
+        #       [1. 1. 1.]
+        #       [1. 1. 1.]]]
+
+    """
+    return constant_like(like, 1, dtype=dtype, name=name)
+
+
+def zeros_like(
+    like: oneflow._oneflow_internal.BlobDesc,
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a Blob that has the same shape as `like` whose all elements are set to `0`.
+
+    Args:
+        like (oneflow._oneflow_internal.BlobDesc): A Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def zeros_like_Job() -> tp.Numpy:
+            constant_blob = flow.constant(value=1.5,
+                                        shape=(1, 3, 3),
+                                        dtype=flow.float)
+            zeros_like_blob = flow.zeros_like(like=constant_blob,
+                                            dtype=flow.float)
+            return zeros_like_blob
+
+
+        out = zeros_like_Job()
+
+        # out [[[0. 0. 0.]
+        #       [0. 0. 0.]
+        #       [0. 0. 0.]]]
+
+    """
+    return constant_like(like, 0, dtype=dtype, name=name)
diff --git a/python/oneflow/compatible/single_client/ops/count_not_finite.py b/python/oneflow/compatible/single_client/ops/count_not_finite.py
new file mode 100644
index 0000000000000000000000000000000000000000..06dafcb7bf00d0992d2a1a8fc07bc4f5f85f5319
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/count_not_finite.py
@@ -0,0 +1,58 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def count_not_finite(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("CountNotFinite_")
+        )
+        .Op("count_not_finite")
+        .Input("x", [x])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def multi_count_not_finite(
+    x: Optional[Sequence[oneflow._oneflow_internal.BlobDesc]] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("MultiCountNotFinite_")
+        )
+        .Op("multi_count_not_finite")
+        .Input("x", x)
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/compatible/single_client/ops/data_ops.py b/python/oneflow/compatible/single_client/ops/data_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ce84ffdea9d2e9ce6c7817ba5886b48712fc5d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/data_ops.py
@@ -0,0 +1,460 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+from typing import List, Optional, Sequence, Tuple, Union
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+class ImagePreprocessor(object):
+    def __init__(self, preprocessor: str) -> None:
+        assert isinstance(preprocessor, str)
+        if preprocessor.lower() != "bgr2rgb" and preprocessor.lower() != "mirror":
+            raise ValueError('preprocessor must be "bgr2rgb" or "mirror".')
+        self.preprocessor = preprocessor
+
+    def is_rgb(self) -> bool:
+        return self.preprocessor.lower() == "bgr2rgb"
+
+    def is_mirror(self) -> bool:
+        return self.preprocessor.lower() == "mirror"
+
+
+class ImageResizePreprocessor(object):
+    def __init__(self, width: int, height: int) -> None:
+        assert isinstance(width, int)
+        assert isinstance(height, int)
+        self.width = width
+        self.height = height
+
+
+class ImageCodec(object):
+    def __init__(
+        self,
+        image_preprocessors: Optional[
+            Sequence[Union[ImagePreprocessor, ImageResizePreprocessor]]
+        ] = None,
+    ) -> None:
+        if isinstance(image_preprocessors, (list, tuple)):
+            self.image_preprocessors = list(image_preprocessors)
+        else:
+            self.image_preprocessors = []
+
+    def color_space(self) -> str:
+        for img_preprocessor in self.image_preprocessors:
+            if (
+                isinstance(img_preprocessor, ImagePreprocessor)
+                and img_preprocessor.is_rgb()
+            ):
+                return "RGB"
+        return "BGR"
+
+    def do_mirror(self) -> bool:
+        for img_preprocessor in self.image_preprocessors:
+            if (
+                isinstance(img_preprocessor, ImagePreprocessor)
+                and img_preprocessor.is_mirror()
+            ):
+                return True
+        return False
+
+    def do_resize(self):
+        for img_preprocessor in self.image_preprocessors:
+            if isinstance(img_preprocessor, ImageResizePreprocessor):
+                return (True, img_preprocessor.width, img_preprocessor.height)
+        return (False, -1, -1)
+
+
+class RawCodec(object):
+    def __init__(self, truncate: bool = False, auto_zero_padding: bool = False) -> None:
+        if auto_zero_padding:
+            print(
+                "WARNING: auto_zero_padding has been deprecated, Please use truncate instead.\n                "
+            )
+        self.truncate = truncate or auto_zero_padding
+
+
+class NormByChannelPreprocessor(object):
+    def __init__(
+        self,
+        mean_values: Union[List[float], Tuple[float]],
+        std_values: Union[List[float], Tuple[float]] = (1.0, 1.0, 1.0),
+        data_format: str = "channels_last",
+    ) -> None:
+        assert isinstance(mean_values, (list, tuple))
+        assert isinstance(std_values, (list, tuple))
+        assert isinstance(data_format, str)
+        self.mean_values = mean_values
+        self.std_values = std_values
+        self.data_format = data_format
+
+    def output_layout(self) -> str:
+        if self.data_format == "channels_last":
+            return "NHWC"
+        else:
+            return "NCHW"
+
+
+class BlobConf(object):
+    def __init__(
+        self,
+        name: str,
+        shape: Sequence[int],
+        dtype: flow.dtype,
+        codec: Union[ImageCodec, RawCodec],
+        preprocessors: Optional[Sequence[Union[NormByChannelPreprocessor,]]] = None,
+    ) -> None:
+        assert isinstance(name, str)
+        assert isinstance(shape, (list, tuple))
+        self.name = name
+        self.shape = shape
+        self.dtype = dtype
+        self.codec = codec
+        if isinstance(preprocessors, (list, tuple)):
+            self.preprocessors = list(preprocessors)
+        else:
+            self.preprocessors = []
+
+    def decode_blob(
+        self, input_blob: oneflow._oneflow_internal.BlobDesc, batch_size: int
+    ) -> oneflow._oneflow_internal.BlobDesc:
+        if isinstance(self.codec, ImageCodec):
+            color_space = self.codec.color_space()
+            image = flow.data.ofrecord_image_decoder(
+                input_blob=input_blob, blob_name=self.name, color_space=color_space
+            )
+            coin_flip = None
+            if self.codec.do_mirror():
+                coin_flip = flow.random.coin_flip(batch_size)
+            (do_resize, width, height) = self.codec.do_resize()
+            if do_resize:
+                assert width > 0 and height > 0
+                (image, _, _) = flow.image.resize(
+                    image=image, target_size=(width, height)
+                )
+            else:
+                assert len(self.shape) >= 2
+                (image, _, _) = flow.image.resize(
+                    image=image, target_size=(self.shape[0], self.shape[1])
+                )
+            for preprocess in self.preprocessors:
+                image = flow.image.crop_mirror_normalize(
+                    input_blob=image,
+                    mirror_blob=coin_flip,
+                    color_space=color_space,
+                    output_layout=preprocess.output_layout(),
+                    mean=preprocess.mean_values,
+                    std=preprocess.std_values,
+                    output_dtype=self.dtype,
+                )
+            return image
+        elif isinstance(self.codec, RawCodec):
+            raw = flow.data.ofrecord_raw_decoder(
+                input_blob=input_blob,
+                blob_name=self.name,
+                shape=self.shape,
+                dtype=self.dtype,
+                truncate=self.codec.truncate,
+            )
+            return raw
+        else:
+            raise NotImplementedError
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def decode_ofrecord(
+    ofrecord_dir: str,
+    blobs: Sequence[BlobConf],
+    batch_size: int = 1,
+    data_part_num: int = 1,
+    part_name_prefix: str = "part-",
+    part_name_suffix_length: int = -1,
+    shuffle: bool = False,
+    buffer_size: int = 1024,
+    name: str = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.data.decode_ofrecord is deprecated, and NOT work in eager mode, please use: \n",
+        "    1)   ofrecord = oneflow.compatible.single_client.data.ofrecord_reader(...) to read ofrecord; \n",
+        "    2)   image = oneflow.compatible.single_client.data.ofrecord_image_decoder(...) to decode image; \n",
+        "    3)   raw = oneflow.compatible.single_client.data.ofrecord_raw_decoder(...) to decode raw data like label; \n",
+        traceback.format_stack()[-2],
+    )
+    assert not flow.eager_execution_enabled()
+    ofrecord = flow.data.ofrecord_reader(
+        ofrecord_dir=ofrecord_dir,
+        batch_size=batch_size,
+        data_part_num=data_part_num,
+        part_name_prefix=part_name_prefix,
+        part_name_suffix_length=part_name_suffix_length,
+        random_shuffle=shuffle,
+        shuffle_buffer_size=buffer_size,
+        name=name,
+    )
+    result_blob_list = []
+    for blob_conf in blobs:
+        result_blob_list.append(
+            blob_conf.decode_blob(input_blob=ofrecord, batch_size=batch_size)
+        )
+    return tuple(result_blob_list)
+
+
+def ofrecord_loader(
+    ofrecord_dir: str,
+    batch_size: int = 1,
+    data_part_num: int = 1,
+    part_name_prefix: str = "part-",
+    part_name_suffix_length: int = -1,
+    shuffle: bool = False,
+    shuffle_buffer_size: int = 1024,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    print(
+        "WARNING:",
+        "oneflow.compatible.single_client.data.ofrecord_loader is deprecated, and NOT work in eager mode, please use: \n",
+        "    ofrecord = oneflow.compatible.single_client.data.ofrecord_reader(...) to read ofrecord; \n",
+        traceback.format_stack()[-2],
+    )
+    return flow.data.ofrecord_reader(
+        ofrecord_dir=ofrecord_dir,
+        batch_size=batch_size,
+        data_part_num=data_part_num,
+        part_name_prefix=part_name_prefix,
+        part_name_suffix_length=part_name_suffix_length,
+        random_shuffle=shuffle,
+        shuffle_buffer_size=shuffle_buffer_size,
+        name=name,
+    )
+
+
+def ofrecord_reader(
+    ofrecord_dir: str,
+    batch_size: int = 1,
+    data_part_num: int = 1,
+    part_name_prefix: str = "part-",
+    part_name_suffix_length: int = -1,
+    random_shuffle: bool = False,
+    shuffle_buffer_size: int = 1024,
+    shuffle_after_epoch: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Get ofrecord object from ofrecord dataset.
+
+    Args:
+        ofrecord_dir (str): Path to ofrecord dataset.
+        batch_size (int, optional): Batch size. Defaults to 1.
+        data_part_num (int, optional): Number of dataset's partitions. Defaults to 1.
+        part_name_prefix (str, optional): Prefix of dataset's parition file. Defaults to "part-".
+        part_name_suffix_length (int, optional): Total length of padded suffix number , -1 means no padding. eg: 3 for `part-001`. Defaults to -1.
+        random_shuffle (bool, optional): Determines records shuffled or not. Defaults to False.
+        shuffle_buffer_size (int, optional): Shuffle buffer size. Defaults to 1024.
+        shuffle_after_epoch (bool, optional): Shuffled or not after each epoch. Defaults to False.
+        name (Optional[str], optional): Optional name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function(type="predict")
+        def ofrecord_reader_job() -> Tuple[tp.Numpy, tp.Numpy]:
+            batch_size = 16
+            with flow.scope.placement("cpu", "0:0"):
+                # our ofrecord file path is "./dataset/part-0"
+                ofrecord = flow.data.ofrecord_reader(
+                    "./dataset/",
+                    batch_size=batch_size,
+                    data_part_num=1,
+                    part_name_suffix_length=-1,
+                    part_name_prefix='part-',
+                    random_shuffle=True,
+                    shuffle_after_epoch=True,
+                )
+                # image shape is (28*28, )
+                image = flow.data.OFRecordRawDecoder(
+                    ofrecord, "images", shape=(784, ), dtype=flow.int32
+                )
+                # label shape is (1, )
+                label = flow.data.OFRecordRawDecoder(
+                    ofrecord, "labels", shape=(1, ), dtype=flow.int32
+                )
+
+                return image, label
+
+        if __name__ == "__main__":
+            images, labels = ofrecord_reader_job()
+            print("In per batch, images shape is", images.shape)
+            print("In per batch, labels shape is", labels.shape)
+
+            # In per batch, images shape is (16, 784)
+            # In per batch, labels shape is (16, 1)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("OFRecord_Reader_")
+    return (
+        flow.user_op_builder(name)
+        .Op("OFRecordReader")
+        .Output("out")
+        .Attr("data_dir", ofrecord_dir)
+        .Attr("data_part_num", data_part_num)
+        .Attr("batch_size", batch_size)
+        .Attr("part_name_prefix", part_name_prefix)
+        .Attr("random_shuffle", random_shuffle)
+        .Attr("shuffle_buffer_size", shuffle_buffer_size)
+        .Attr("shuffle_after_epoch", shuffle_after_epoch)
+        .Attr("part_name_suffix_length", part_name_suffix_length)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def decode_random(
+    shape: Sequence[int],
+    dtype: flow.dtype,
+    batch_size: int = 1,
+    initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    tick: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    op_conf = op_conf_util.OperatorConf()
+    if name is None:
+        name = id_util.UniqueStr("DecodeRandom_")
+    assert isinstance(name, str)
+    op_conf.name = name
+    assert isinstance(shape, (list, tuple))
+    op_conf.decode_random_conf.shape.dim.extend(shape)
+    assert dtype is not None
+    setattr(
+        op_conf.decode_random_conf,
+        "data_type",
+        oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(dtype),
+    )
+    op_conf.decode_random_conf.batch_size = batch_size
+    if initializer is not None:
+        op_conf.decode_random_conf.data_initializer.CopyFrom(initializer)
+    else:
+        op_conf.decode_random_conf.data_initializer.CopyFrom(
+            flow.random_uniform_initializer()
+        )
+    if tick:
+        op_conf.decode_random_conf.tick = tick.unique_name
+    op_conf.decode_random_conf.out = "out"
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    interpret_util.ConsistentForward(op_conf)
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def image_decoder_random_crop_resize(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    target_width: int,
+    target_height: int,
+    num_attempts: Optional[int] = None,
+    seed: Optional[int] = None,
+    random_area: Optional[Sequence[float]] = None,
+    random_aspect_ratio: Optional[Sequence[float]] = None,
+    num_workers: Optional[int] = None,
+    warmup_size: Optional[int] = None,
+    max_num_pixels: Optional[int] = None,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    if name is None:
+        name = id_util.UniqueStr("ImageDecoderRandomCropResize_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    setattr(op_conf.image_decoder_random_crop_resize_conf, "in", input_blob.unique_name)
+    op_conf.image_decoder_random_crop_resize_conf.out = "out"
+    op_conf.image_decoder_random_crop_resize_conf.target_width = target_width
+    op_conf.image_decoder_random_crop_resize_conf.target_height = target_height
+    if num_attempts is not None:
+        op_conf.image_decoder_random_crop_resize_conf.num_attempts = num_attempts
+    if seed is not None:
+        op_conf.image_decoder_random_crop_resize_conf.seed = seed
+    if random_area is not None:
+        assert len(random_area) == 2
+        op_conf.image_decoder_random_crop_resize_conf.random_area_min = random_area[0]
+        op_conf.image_decoder_random_crop_resize_conf.random_area_max = random_area[1]
+    if random_aspect_ratio is not None:
+        assert len(random_aspect_ratio) == 2
+        op_conf.image_decoder_random_crop_resize_conf.random_aspect_ratio_min = random_aspect_ratio[
+            0
+        ]
+        op_conf.image_decoder_random_crop_resize_conf.random_aspect_ratio_max = random_aspect_ratio[
+            1
+        ]
+    if num_workers is not None:
+        op_conf.image_decoder_random_crop_resize_conf.num_workers = num_workers
+    if warmup_size is not None:
+        op_conf.image_decoder_random_crop_resize_conf.warmup_size = warmup_size
+    if max_num_pixels is not None:
+        op_conf.image_decoder_random_crop_resize_conf.max_num_pixels = max_num_pixels
+    interpret_util.Forward(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def onerec_reader(
+    files,
+    batch_size=1,
+    random_shuffle=False,
+    shuffle_mode="instance",
+    shuffle_buffer_size=1024,
+    shuffle_after_epoch=False,
+    verify_example=True,
+    name=None,
+):
+    assert isinstance(files, (list, tuple))
+    if name is None:
+        name = id_util.UniqueStr("OneRecReader_")
+    return (
+        flow.user_op_builder(name)
+        .Op("OneRecReader")
+        .Output("out")
+        .Attr("files", files)
+        .Attr("batch_size", batch_size)
+        .Attr("random_shuffle", random_shuffle)
+        .Attr("shuffle_mode", shuffle_mode)
+        .Attr("shuffle_buffer_size", shuffle_buffer_size)
+        .Attr("shuffle_after_epoch", shuffle_after_epoch)
+        .Attr("verify_example", verify_example)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/compatible/single_client/ops/diag_ops.py b/python/oneflow/compatible/single_client/ops/diag_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb13e311fc0fdfe3302c6f7b44ece04fbe6603a8
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/diag_ops.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def diag(
+    input: oneflow._oneflow_internal.BlobDesc,
+    diagonal: Optional[int] = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator compute diagonal. 
+
+    If input is a vector, then returns a square matrix with the elements of input as the diagonal.
+    If input is a matrix, then returns a vector with the diagonal elements of input.
+    Args:
+        input (remote_blob_util.BlobDef): The input Blob.
+        diagonal (Optional[int], 0): The diagonal to consider. If diagonal = 0, it is the main diagonal. If diagonal > 0, it is above the main diagonal. If diagonal < 0, it is below the main diagonal. Defaults to 0.
+
+    Returns:
+        remote_blob_util.BlobDef: The result Blob. 
+
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def Diag_Job(input: tp.Numpy.Placeholder((3, 3), dtype=flow.float32),) -> tp.Numpy:
+            return flow.diag(input)
+
+
+        input = np.array([[1.0, 2.0, 3.0],
+                          [4.0, 5.0, 6.0],
+                          [7.0, 8.0, 9.0],], dtype=np.float32)
+        out = Diag_Job(input)
+        # out [1. 5. 9.]
+
+    """
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Diag_"))
+        .Op("diag")
+        .Input("in", [input])
+        .Attr("diagonal", int(diagonal))
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/compatible/single_client/ops/domain_ops.py b/python/oneflow/compatible/single_client/ops/domain_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a40c4e88d57df4d7e03317905cc64cc1df64e06
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/domain_ops.py
@@ -0,0 +1,42 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import typing
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+
+
+def api_fused_self_attention_query_mul_key_and_value(
+    x: oneflow._oneflow_internal.BlobDesc,
+    head_size: int,
+    alpha: float = 1.0,
+    name: typing.Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = id_util.UniqueStr("FusedSelfAttentionQueryMulKeyAndValue_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("fused_self_attention_query_mul_key_and_value")
+        .Input("hidden_states", [x])
+        .Attr("head_size", int(head_size))
+        .Attr("alpha", float(alpha))
+        .Output("query_mul_key")
+        .Output("value")
+        .Build()
+    )
+    (qmk, v) = op.InferAndTryRun().RemoteBlobList()
+    return (qmk, v)
diff --git a/python/oneflow/compatible/single_client/ops/eager_nccl_ops.py b/python/oneflow/compatible/single_client/ops/eager_nccl_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32e70a9e1d6bb56cfdb7e358518c12444015d28
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/eager_nccl_ops.py
@@ -0,0 +1,40 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def eager_nccl_all_reduce(
+    x: oneflow._oneflow_internal.BlobDesc,
+    parallel_conf: str,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("EagerNcclAllReduce_")
+        )
+        .Op("eager_nccl_all_reduce")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("parallel_conf", parallel_conf)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/compatible/single_client/ops/get_variable.py b/python/oneflow/compatible/single_client/ops/get_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a73083ce179a00dfff17e627b71133271ecaf41
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/get_variable.py
@@ -0,0 +1,385 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.register import logical_blob_id as lbi_util
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import boxing_util as boxing_util
+from oneflow.compatible.single_client.eager import gradient_util as gradient_util
+from oneflow.compatible.single_client.eager import op_executor as op_executor
+from oneflow.compatible.single_client.experimental import namescope as name_scope
+from oneflow.compatible.single_client.framework import (
+    compile_context as compile_context,
+)
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import runtime_mode as rt_mode
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.job import regularizer_conf_pb2 as regularizer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def api_get_variable(
+    name: str,
+    shape: Optional[Sequence[int]] = None,
+    dtype: Optional[flow.dtype] = flow.float32,
+    initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    trainable: Optional[bool] = None,
+    model_name: Optional[str] = None,
+    random_seed: Optional[int] = None,
+    parallel_distribution: Optional[
+        Union[
+            Sequence[oneflow._oneflow_internal.distribute.Distribute],
+            Sequence[str],
+            str,
+        ]
+    ] = None,
+    distribute: Optional[oneflow._oneflow_internal.distribute.Distribute] = None,
+    reuse: bool = True,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Create a variable or retrieve an existing one.
+
+    Args:
+        name: Name of this variable. One variable could be shared by multiple OneFlow functions. `None` by default
+        shape: Shape of the variable. `None` by default
+        dtype: Data type of the variable. `None` by default
+        initializer: A initializer object. For instance, a :func:`~oneflow.compatible.single_client.ones_initializer`. `None` by default
+        trainable: A `bool` to indicate if this variable is trainable. `True` by default
+        model_name: A `string`. `'weight'` or `'bias'`. `None` by default
+        random_seed: Random seed for random initializers. `None` by default
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def variable_Job() -> None:
+            init = flow.constant_initializer(1.25)
+            variable = flow.get_variable(
+                "variable-weight",
+                shape=(1, 3, 2, 2),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(variable, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        variable_Job()
+
+        # out [[[[1.25 1.25]
+        #        [1.25 1.25]]
+
+        #       [[1.25 1.25]
+        #        [1.25 1.25]]
+
+        #       [[1.25 1.25]
+        #        [1.25 1.25]]]]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def conv2d(input, filters, kernel_size, strides, padding, name):
+            input_shape = input.shape
+            weight_initializer = flow.truncated_normal(0.1)
+            weight_regularizer = flow.regularizers.l2(0.0005)
+            weight_shape = (filters,
+                            input_shape[1],
+                            kernel_size[0],
+                            kernel_size[1])
+
+            weight = flow.get_variable(
+                name + "-weight",
+                shape=weight_shape,
+                initializer=weight_initializer,
+                regularizer=weight_regularizer,
+            )
+            return flow.nn.conv2d(input, weight, strides, padding, name=name)
+
+
+        @flow.global_function()
+        def conv2d_Job(x: tp.Numpy.Placeholder((1, 64, 32, 32))
+        ) -> tp.Numpy:
+            conv = conv2d(x,
+                        filters=128,
+                        kernel_size=[3, 3],
+                        strides=2,
+                        padding='SAME',
+                        name="ConvLayer")
+            return conv
+
+
+        x = np.random.randn(1, 64, 32, 32).astype(np.float32)
+        out = conv2d_Job(x)
+
+        # out.shape (1, 128, 16, 16)
+
+    """
+    if distribute is not None:
+        assert parallel_distribution is None
+        parallel_distribution = [distribute]
+    if parallel_distribution is None:
+        parallel_distribution = []
+    if isinstance(parallel_distribution, str):
+        parallel_distribution = parallel_distribution.split(",")
+    assert isinstance(parallel_distribution, (list, tuple))
+
+    def distribute_to_str(dist):
+        if dist is None:
+            return ""
+        elif type(dist) is str:
+            return dist
+        elif type(dist) is oneflow._oneflow_internal.distribute.SplitDistribute:
+            return "S({})".format(dist.axis)
+        elif type(dist) is oneflow._oneflow_internal.distribute.BroadcastDistribute:
+            return "B"
+        else:
+            raise ValueError("unsupported distribute")
+
+    parallel_distribution = list(map(distribute_to_str, parallel_distribution))
+    api = enable_if.unique([get_lazy_variable, get_eager_variable])
+    return api(
+        name,
+        shape=shape,
+        dtype=dtype,
+        initializer=initializer,
+        regularizer=regularizer,
+        trainable=trainable,
+        model_name=model_name,
+        random_seed=random_seed,
+        parallel_distribution=parallel_distribution,
+        reuse=reuse,
+    )
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def get_eager_variable(
+    name,
+    shape=None,
+    dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=None,
+    model_name=None,
+    random_seed=None,
+    parallel_distribution=None,
+    reuse=True,
+):
+    assert isinstance(name, str)
+    assert isinstance(
+        shape, (list, tuple)
+    ), "param shape should be a list or tuple of dimension"
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    name = name_scope.GetJobNameScopePrefix(job_name) + name
+    sess = session_ctx.GetDefaultSession()
+    (var_blob, job_var_blob) = sess.TryGetVariableBlobOfJobFromStash(job_name, name)
+    if reuse is False:
+        assert (
+            job_var_blob is None
+        ), "variable '{}' already exists, getting the same variable is not allowed when reuse is False".format(
+            name
+        )
+    if job_var_blob is None:
+        op_conf = GenerateVariableOpConf(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            initializer=initializer,
+            regularizer=regularizer,
+            trainable=trainable,
+            model_name=model_name,
+            random_seed=random_seed,
+            parallel_distribution=parallel_distribution,
+        )
+        op_attribute = compile_context.CurJobAddConsistentOp(op_conf)
+        if var_blob is None:
+            var_blob = CreateEagerVariableBlob(op_attribute)
+            op_executor.EagerInitVariableBlob(sess, op_conf, var_blob)
+        assert isinstance(var_blob, oneflow._oneflow_internal.EagerConsistentBlob)
+        sess.StashVariableBlob4Job(job_name, op_conf.name, var_blob)
+    else:
+        assert isinstance(job_var_blob, oneflow._oneflow_internal.EagerConsistentBlob)
+        assert isinstance(var_blob, oneflow._oneflow_internal.EagerConsistentBlob)
+        assert var_blob.IdenticalTo(job_var_blob)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    bw_blob_register.TrySetObject4BlobName(
+        var_blob.logical_blob_name, var_blob.blob_object
+    )
+    return var_blob
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def get_lazy_variable(
+    name,
+    shape=None,
+    dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=None,
+    model_name=None,
+    random_seed=None,
+    parallel_distribution=None,
+    reuse=True,
+):
+    assert isinstance(name, str)
+    assert isinstance(
+        shape, (list, tuple)
+    ), "param shape should be a list or tuple of dimension"
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    name = name_scope.GetJobNameScopePrefix(job_name) + name
+    sess = session_ctx.GetDefaultSession()
+    (var_blob, job_var_blob) = sess.TryGetVariableBlobOfJobFromStash(job_name, name)
+    if reuse is False:
+        assert (
+            job_var_blob is None
+        ), "variable '{}' already exists, getting the same variable is not allowed when param reuse is False".format(
+            name
+        )
+    if job_var_blob is None:
+        op_conf = GenerateVariableOpConf(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            initializer=initializer,
+            regularizer=regularizer,
+            trainable=trainable,
+            model_name=model_name,
+            random_seed=random_seed,
+            parallel_distribution=parallel_distribution,
+        )
+        job_var_blob = _CreateVariableBlob(op_conf)
+        assert isinstance(job_var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+        sess.StashVariableBlob4Job(job_name, op_conf.name, job_var_blob)
+        if var_blob is not None:
+            assert isinstance(var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+            assert var_blob.IdenticalTo(job_var_blob)
+    else:
+        assert isinstance(job_var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+        assert isinstance(var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+        assert var_blob.IdenticalTo(job_var_blob)
+    return job_var_blob
+
+
+def GenerateVariableOpConf(
+    name,
+    shape,
+    dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=None,
+    model_name=None,
+    random_seed=None,
+    parallel_distribution=None,
+):
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    op_conf.variable_conf.shape.dim.extend(shape)
+    assert dtype is not None
+    op_conf.variable_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+        dtype
+    )
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        root_path = None
+    else:
+        root_path = (
+            compile_context.GetCurJobConfigProto().default_initialize_with_snapshot_path()
+        )
+        dir_path = os.path.join(root_path, name)
+        file_path = os.path.join(dir_path, "out")
+    if root_path and os.path.isfile(file_path):
+        op_conf.variable_conf.initialize_with_snapshot.path = dir_path
+        op_conf.variable_conf.initialize_with_snapshot.key = "out"
+    else:
+        if root_path:
+            print("{} not found, will be initialized".format(file_path))
+        if initializer is not None:
+            op_conf.variable_conf.initializer.CopyFrom(initializer)
+    if regularizer is not None:
+        op_conf.variable_conf.regularizer.CopyFrom(regularizer)
+    if trainable is not None:
+        op_conf.variable_conf.trainable = trainable
+    if model_name is not None:
+        op_conf.variable_conf.model_name = model_name
+    if parallel_distribution is None:
+        parallel_distribution = []
+    op_conf.variable_conf.parallel_distribution.extend(parallel_distribution)
+    if random_seed is not None:
+        op_conf.variable_conf.random_seed = random_seed
+    op_conf.variable_conf.out = "out"
+    return op_conf
+
+
+def _CreateVariableBlob(op_conf):
+    compile_context.CurJobAddConsistentOp(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.variable_conf.out
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def CreateEagerVariableBlob(op_attribute, job_name=""):
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildInstruction(builder):
+        parallel_conf = flow.current_scope().device_parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+    lbi = lbi_util.LogicalBlobId()
+    lbi.set_op_name(op_attribute.op_conf.name)
+    lbi.set_blob_name(op_attribute.op_conf.variable_conf.out)
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    return oneflow._oneflow_internal.EagerConsistentBlob(
+        lbi,
+        blob_object=bn_in_op2blob_object["out"],
+        blob_register=blob_register,
+        job_name=job_name,
+    )
diff --git a/python/oneflow/compatible/single_client/ops/initializer_util.py b/python/oneflow/compatible/single_client/ops/initializer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6ea50ca25fe8475e080120450b7d500e5a5e895
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/initializer_util.py
@@ -0,0 +1,1217 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+import math
+from typing import Optional, Sequence, Union
+
+import numpy as np
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def empty_initializer(
+    dtype: flow.dtype = flow.float,
+) -> initializer_conf_util.InitializerConf:
+    initializer = initializer_conf_util.InitializerConf()
+    empty_conf = initializer_conf_util.EmptyInitializerConf()
+    initializer.empty_conf.CopyFrom(empty_conf)
+    return initializer
+
+
+def constant_initializer(
+    value: float = 0, dtype: flow.dtype = flow.float
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blob with constant values.
+
+    Args:
+        value (float, optional): A Python scalar. All elements of the initialized variable . Defaults to 0.
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Raises:
+        NotImplementedError:  Do not support such data type.
+
+    Returns:
+        initializer_conf_util.InitializerConf:  An InitializerConf object.
+    
+    For example: 
+
+    Example 1:
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def constant_Job() -> None:
+            init = flow.constant_initializer(2.5)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        constant_Job()
+
+        # out [2.5 2.5 2.5]
+
+    Example 2:
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_constant_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.constant_initializer(0.01)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_constant_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    initializer = initializer_conf_util.InitializerConf()
+    if dtype in [flow.float, flow.double]:
+        setattr(initializer.constant_conf, "value", float(value))
+    elif dtype in [flow.int8, flow.int32, flow.int64]:
+        setattr(initializer.constant_int_conf, "value", int(value))
+    else:
+        raise NotImplementedError("Do not support such data type")
+    return initializer
+
+
+def zeros_initializer(
+    dtype: flow.dtype = flow.float,
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blobs initialized to 0
+
+    Args:
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Returns:
+        initializer_conf_util.InitializerConf: constant_initializer
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def zeros_Job() -> None:
+            init = flow.zeros_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        zeros_Job()
+
+        # out [0. 0. 0.]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_zero_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.zeros_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_zero_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return constant_initializer(0.0, dtype)
+
+
+def ones_initializer(
+    dtype: flow.dtype = flow.float,
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blobs initialized to 1.
+
+    Args:
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Returns:
+        initializer_conf_util.InitializerConf: constant_initializer
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def ones_Job() -> None:
+            init = flow.ones_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        ones_Job()
+
+        # out [1. 1. 1.]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_one_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.ones_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_one_Job(x)
+        
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return constant_initializer(1.0, dtype)
+
+
+def random_uniform_initializer(
+    minval: float = 0, maxval: float = 1, dtype: flow.dtype = flow.float
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blobs with a uniform distribution. 
+
+    Args:
+        minval (float, optional): A python scalar. Lower bound of the range of random values to generate. Defaults to 0.
+        maxval (float, optional): A python scalar. Upper bound of the range of random values to generate. Defaults to 1.
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Raises:
+        NotImplementedError: Do not support such data type.
+
+    Returns:
+        initializer_conf_util.InitializerConf:  Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def random_uniform_Job() -> None:
+            init = flow.random_uniform_initializer(minval=0, maxval=0.5)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        random_uniform_Job()
+
+        # out [0.07557311 0.3943565  0.31875622]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_random_uniform_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.random_uniform_initializer(minval=0, maxval=0.5)
+
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_random_uniform_Job(x)
+        
+        # out.shape (1, 128, 32, 32)
+
+    """
+    assert minval <= maxval
+    initializer = initializer_conf_util.InitializerConf()
+    if dtype in [flow.float, flow.double]:
+        setattr(initializer.random_uniform_conf, "min", float(minval))
+        setattr(initializer.random_uniform_conf, "max", float(maxval))
+    elif dtype in [flow.int8, flow.int32, flow.int64]:
+        setattr(initializer.random_uniform_int_conf, "min", int(minval))
+        setattr(initializer.random_uniform_int_conf, "max", int(maxval))
+    else:
+        raise NotImplementedError("Do not support such data type")
+    return initializer
+
+
+def random_normal_initializer(
+    mean: float = 0.0,
+    stddev: float = 1.0,
+    seed: Optional[int] = None,
+    dtype: Optional[flow.dtype] = None,
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blob with a normal distribution.
+
+    Args:
+        mean (float, optional): A python scalar. Mean of the random values to generate.. Defaults to 0.0.
+        stddev (float, optional): A python scalar. Standard deviation of the random values to generate. Defaults to 1.0.
+        seed (Optional[int], optional): None. Not support yet. Defaults to None.
+        dtype (Optional[flow.dtype], optional): . Defaults to None.
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def random_normal_Job() -> None:
+            init = flow.random_normal_initializer(mean=1, stddev=1)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        random_normal_Job()
+
+        # out [1.4190257 2.7663114 1.7114428]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_random_normal_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.random_normal_initializer(mean=0, stddev=1)
+
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_random_normal_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    assert seed is None
+    assert dtype is None
+    if seed is not None:
+        assert name is not None
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.random_normal_conf, "mean", float(mean))
+    setattr(initializer.random_normal_conf, "std", float(stddev))
+    return initializer
+
+
+def truncated_normal_initializer(
+    mean: float = 0.0, stddev: float = 1.0
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a truncated normal distribution.
+
+    Args:
+        mean (float, optional): A scalar (float). Defaults to 0.0.
+        stddev (float, optional): A scalar (float). Defaults to 1.0.
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def truncated_normal_Job() -> None:
+            init = flow.truncated_normal_initializer(mean=1, stddev=1)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        truncated_normal_Job()
+
+        # out [1.8303236  0.09787154 0.83049864]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_truncated_normal_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal_initializer(mean=0, stddev=1)
+
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_truncated_normal_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.truncated_normal_conf, "mean", float(mean))
+    setattr(initializer.truncated_normal_conf, "std", float(stddev))
+    return initializer
+
+
+def glorot_uniform_initializer(
+    data_format: str = "",
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a Xavier uniform distribution. 
+    
+    It also can be called as `oneflow.compatible.single_client.glorot_uniform_initializer`.  
+
+    The equation is: 
+
+    .. math:: 
+
+        W\\sim U(-\\sqrt{\\frac{{6}}{{n_j+n_{j+1}}}},\\sqrt{\\frac{{6}}{{n_j+n_{j+1}}}})
+
+    :math:`U` means uniform distribution 
+
+    :math:`n_j` means the amount of Nth layer parameters 
+
+    Args:
+        data_format (str, optional): The data format. Defaults to "".
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1:
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def xavier_uniform_Job() -> None:
+            init = flow.xavier_uniform_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        xavier_uniform_Job()
+
+        # out [[-0.14424723 -0.9532095  -0.08723891]
+        #      [-0.8011227  -0.29729813 -0.26769108]
+        #      [ 0.9208976  -0.5971756  -0.15077025]]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_xavier_uniform_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.xavier_uniform_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_xavier_uniform_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return variance_scaling_initializer(1.0, "fan_avg", "random_uniform", data_format)
+
+
+def glorot_normal_initializer(
+    data_format: str = "",
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a Xavier normal distribution. 
+    
+    It also can be called as `oneflow.compatible.single_client.glorot_normal_initializer`.  
+
+    The equation is: 
+
+    .. math:: 
+
+        W\\sim N(0, \\sqrt{\\frac{{2}}{{n_j+n_{j+1}}}})
+
+    :math:`N` means normal distribution 
+
+    :math:`n_j` means the amount of Nth layer parameters 
+
+    Args:
+        data_format (str, optional): The data format. Defaults to "".
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def xavier_normal_Job() -> None:
+            init = flow.xavier_normal_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        xavier_normal_Job()
+
+        # out [[ 0.5908121  -0.10804518 -0.6148571 ]
+        #      [ 1.4007381  -0.08172473  0.36579943]
+        #      [-0.6461796  -0.15923311  0.33653972]]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_xavier_normal_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.xavier_normal_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_xavier_normal_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return variance_scaling_initializer(1.0, "fan_avg", "random_normal", data_format)
+
+
+def variance_scaling_initializer(
+    scale: float = 1.0,
+    mode: str = "fan_in",
+    distribution: str = "truncated_normal",
+    data_format: str = "",
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a truncated normal distribution or a random normal distribution or a random uniform distribution with a scale adapting to it.
+
+    When the distribution is "truncated_normal"
+
+    The equation is: 
+
+    .. math:: 
+
+        W\\sim N(0, \\sqrt{\\frac{{scale}}{{n}}})
+
+    If mode is "fan_in", the "n" is the number of input units in the weight Blob. 
+
+    If mode is "fan_out", the "n" is the number of output units in the weight Blob. 
+
+    if mode is "fan_avg", the "n" is the average of the number of input and output units in the weight Blob
+
+    Args:
+        scale (float, optional): Scaling factor (positive float). Defaults to 1.0.
+        mode (str, optional): One of "fan_in", "fan_out", "fan_avg". Defaults to "fan_in".
+        distribution (str, optional): Random distribution to use. One of "truncated_normal",. Defaults to "truncated_normal".
+        data_format (str, optional): A string be one of "N...C" or "NC...". Defaults to "".
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def variance_scale_Job() -> None:
+            init = flow.variance_scaling_initializer(scale=2.0, mode="fan_avg")
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        variance_scale_Job()
+
+        # out [[-0.13931477  0.12266728 -0.9434968 ]
+        #      [-0.49665168  0.10231158 -0.19194333]
+        #      [-0.7902896  -1.7034698  -0.38695997]]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_variance_scaling_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.variance_scaling_initializer(mode="fan_out")
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_variance_scaling_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.variance_scaling_conf, "scale", float(scale))
+    setattr(
+        initializer.variance_scaling_conf, "variance_norm", _get_variance_norm(mode)
+    )
+    setattr(
+        initializer.variance_scaling_conf,
+        "distribution",
+        _get_random_distribution(distribution),
+    )
+    setattr(
+        initializer.variance_scaling_conf, "data_format", _get_data_format(data_format)
+    )
+    return initializer
+
+
+def kaiming_initializer(
+    shape: Sequence[int],
+    distribution: str = "random_normal",
+    mode: str = "fan_in",
+    nonlinearity: str = "leaky_relu",
+    negative_slope: float = 0.0,
+    data_format: str = "NCHW",
+) -> None:
+    """Initialize weight according to the method described in `Delving deep into
+    rectifiers: Surpassing human-level performance on ImageNet classification`
+    - He, K. et al. (2015), using a normal or uniform distribution.
+
+    When distribution is "random_normal"
+
+    The equation is: 
+
+    .. math:: 
+
+        W \\sim N(0, \\sqrt{\\frac{{2}}{{n}}})
+
+    When distribution is "random_uniform"
+
+    The equation is: 
+
+    .. math:: 
+
+        W \\sim U(-\\sqrt{\\frac{{6}}{{n}}}, \\sqrt{\\frac{{6}}{{n}}})
+    
+    If mode is "fan_in", the "n" is the number of input units in the weight Blob. 
+
+    If mode is "fan_out", the "n" is the number of output units in the weight Blob. 
+
+    if mode is "fan_avg", the "n" is the average of the number of input and output units in the weight Blob
+
+    Args:
+        shape (Sequence[int]): Blob shape.
+        distribution (str, optional): 'random_normal' or 'random_uniform'. Defaults to "random_normal".
+        mode (str, optional): 'fan_in', 'fan_out' or 'fan_avg'. Defaults to "fan_in".
+        nonlinearity (str, optional): None, 'tanh', 'sigmoid', 'relu' or 'leaky_relu'. Defaults to "leaky_relu".
+        negative_slope (float, optional): The negative slope of leaky_relu. Defaults to 0.0.
+        data_format (str, optional):  'NCHW', 'NHWC'. Defaults to "NCHW".
+
+    Raises:
+        NotImplementedError: Only support normal and uniform distribution
+
+    Returns:
+        [type]: flow.random_normal_initializer or flow.random_uniform_initializer
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def kaiming_Job() -> None:
+            init = flow.kaiming_initializer(shape=(3, 3), 
+                                            mode="fan_avg", 
+                                            nonlinearity="relu")
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        kaiming_Job()
+
+        # out [[ 0.54521346  0.32585594  1.3474437 ]
+        #      [ 0.30729076 -0.19158769  0.2709008 ]
+        #      [-0.95830524 -0.05093324  0.28178614]]
+
+    Example 2: 
+
+    .. code-block:: python 
+    
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_kaiming_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.kaiming_initializer(shape=(1, 256, 32, 32))
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_kaiming_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    assert isinstance(shape, (tuple, flow.Size))
+    assert len(shape) >= 2
+    elem_cnt = functools.reduce(lambda a, b: a * b, shape, 1)
+    assert elem_cnt > 0
+    assert distribution in ["random_normal", "random_uniform"]
+    assert mode in ["fan_in", "fan_out", "fan_avg"]
+    assert nonlinearity in [None, "tanh", "sigmoid", "relu", "leaky_relu"]
+    assert data_format in ["NCHW", "NHWC"]
+    fan = _CalcFan(shape, mode, _get_data_format(data_format))
+    gain = CalcGain(nonlinearity, negative_slope)
+    std = gain / math.sqrt(fan)
+    if distribution == "random_normal":
+        return flow.random_normal_initializer(0.0, std)
+    elif distribution == "random_uniform":
+        bound = math.sqrt(3.0) * std
+        return flow.random_uniform_initializer(-bound, bound)
+    else:
+        raise NotImplementedError("Only support normal and uniform distribution")
+
+
+def _get_variance_norm(mode):
+    if mode.lower() == "fan_in":
+        return initializer_conf_util.kFanIn
+    elif mode.lower() == "fan_out":
+        return initializer_conf_util.kFanOut
+    elif mode.lower() == "fan_avg":
+        return initializer_conf_util.kAverage
+    else:
+        raise ValueError("Invalid variance_norm")
+
+
+def _get_random_distribution(distribution):
+    if distribution.lower() == "truncated_normal":
+        return initializer_conf_util.kTruncatedNormal
+    elif distribution.lower() == "random_normal":
+        return initializer_conf_util.kRandomNormal
+    elif distribution.lower() == "random_uniform":
+        return initializer_conf_util.kRandomUniform
+    else:
+        raise ValueError("Invalid random_distribution")
+
+
+def _get_data_format(data_format):
+    assert isinstance(data_format, str), "data_format must be a string"
+    if data_format.startswith("NC"):
+        return "channels_first"
+    elif data_format.startswith("N") and data_format.endswith("C"):
+        return "channels_last"
+    else:
+        assert data_format == "", ValueError(
+            'data_format must be "N...C" or "NC..." or ""'
+        )
+        return ""
+
+
+def _CalcFan(shape, mode, data_format):
+    if len(shape) == 2:
+        fan_in = shape[1]
+        fan_out = shape[0]
+    else:
+        fan_in = 1.0
+        for dim in shape[1:]:
+            fan_in *= dim
+        fan_out = shape[0]
+        if data_format == "channels_first":
+            for dim in shape[2:]:
+                fan_out *= dim
+        elif data_format == "channels_last":
+            for dim in shape[1:-1]:
+                fan_out *= dim
+        else:
+            raise NotImplementedError(
+                "Only support 'channels_first' and 'channels_last' data format"
+            )
+    if mode == "fan_avg":
+        return (float(fan_in) + float(fan_out)) / 2
+    elif mode == "fan_in":
+        return float(fan_in)
+    elif mode == "fan_out":
+        return float(fan_out)
+    else:
+        raise NotImplementedError("Only support 'fan_in', 'fan_out' and 'fan_avg' mode")
+
+
+def CalcGain(nonlinearity, param):
+    linear_fns = [
+        "linear",
+        "conv1d",
+        "conv2d",
+        "conv3d",
+        "conv_transpose1d",
+        "conv_transpose2d",
+        "conv_transpose3d",
+    ]
+    if nonlinearity in linear_fns or nonlinearity == "sigmoid":
+        return 1
+    elif nonlinearity == "tanh":
+        return 5.0 / 3
+    elif nonlinearity == "relu":
+        return math.sqrt(2.0)
+    elif nonlinearity == "leaky_relu":
+        if param is None:
+            negative_slope = 0.01
+        elif (
+            not isinstance(param, bool)
+            and isinstance(param, int)
+            or isinstance(param, float)
+        ):
+            negative_slope = param
+        else:
+            raise ValueError("negative_slope {} not a valid number".format(param))
+        return math.sqrt(2.0 / (1 + negative_slope ** 2))
+    elif nonlinearity == "selu":
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+_init_map = {}
+
+
+def register_initializer(flow_initializer):
+    def deco(func):
+        _init_map[flow_initializer] = func
+        return func
+
+    return deco
+
+
+def GetInitializer(initializer_conf, random_seed, var_blob_shape):
+    f = None
+    for m in _init_map:
+        if initializer_conf.HasField(m):
+            f = _init_map[m]
+            break
+    assert f is not None, initializer_conf
+    return f(getattr(initializer_conf, m), random_seed, var_blob_shape)
+
+
+@register_initializer("constant_conf")
+@register_initializer("constant_int_conf")
+def ConstantInitializerImpl(
+    initializer_conf: Union[
+        initializer_conf_util.ConstantInitializerConf,
+        initializer_conf_util.ConstantIntInitializerConf,
+    ],
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    return lambda length: np.full((length,), initializer_conf.value)
+
+
+@register_initializer("random_normal_conf")
+def RandomNormalInitializerImpl(
+    initializer_conf: initializer_conf_util.RandomNormalInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: rng.normal(
+        loc=initializer_conf.mean, scale=initializer_conf.std, size=length
+    )
+
+
+@register_initializer("random_uniform_conf")
+def RandomUniformInitializerImpl(
+    initializer_conf: initializer_conf_util.RandomUniformIntInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: rng.uniform(
+        low=initializer_conf.min,
+        high=np.nextafter(initializer_conf.max, float("inf")),
+        size=length,
+    )
+
+
+@register_initializer("random_uniform_int_conf")
+def RandomUniformIntInitializerImpl(
+    initializer_conf: initializer_conf_util.RandomUniformIntInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: rng.integers(
+        low=initializer_conf.min, high=initializer_conf.max, size=length
+    )
+
+
+def RngTruncatedNormal(mean, std, length, rng):
+    truncated_value = 2 * std
+    data = np.empty(length)
+    generated = 0
+    ratio = 1.2
+    while generated < length:
+        remaining = length - generated
+        norm = rng.normal(mean, std, size=int(remaining * ratio))
+        truncated = norm[np.abs(norm - mean) < truncated_value][:remaining]
+        data[generated : generated + len(truncated)] = truncated
+        generated += len(truncated)
+    return data
+
+
+@register_initializer("truncated_normal_conf")
+def TruncatedNormalInitializerImpl(
+    initializer_conf: initializer_conf_util.TruncatedNormalInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: RngTruncatedNormal(
+        initializer_conf.mean, initializer_conf.std, length, rng
+    )
+
+
+def GenInitialFan(initializer_conf, var_blob_shape: Sequence[int]):
+    variance_norm = initializer_conf.variance_norm
+    data_format = initializer_conf.data_format
+    fan_in = np.prod(var_blob_shape[1:]).astype(np.int).item()
+    fan_out = var_blob_shape[0]
+    if data_format == "channel_first":
+        fan_out *= np.prod(var_blob_shape[2:]).astype(np.int).item()
+    else:
+        fan_out *= np.prod(var_blob_shape[1:-1]).astype(np.int).item()
+    if variance_norm == initializer_conf_util.kAverage:
+        fan = (fan_in + fan_out) / 2
+    elif variance_norm == initializer_conf_util.kFanIn:
+        fan = fan_in
+    elif variance_norm == initializer_conf_util.kFanOut:
+        fan = fan_out
+    else:
+        raise NotImplemented()
+    return fan
+
+
+@register_initializer("variance_scaling_conf")
+def VarianceScalingInitializerImpl(
+    initializer_conf: initializer_conf_util.VarianceScalingInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    scale = initializer_conf.scale / GenInitialFan(initializer_conf, var_blob_shape)
+    distribution = initializer_conf.distribution
+    rng = np.random.default_rng(random_seed)
+    if distribution == initializer_conf_util.kTruncatedNormal:
+        stddev = math.sqrt(scale) / 0.8796256610342398
+        return lambda length: RngTruncatedNormal(0, stddev, length, rng)
+    elif distribution == initializer_conf_util.kRandomNormal:
+        stddev = math.sqrt(scale)
+        return lambda length: rng.normal(0, stddev, size=length)
+    elif distribution == initializer_conf_util.kRandomUniform:
+        limit = math.sqrt(3.0 * scale)
+        return lambda length: rng.uniform(low=-limit, high=limit, size=length)
+    else:
+        raise NotImplemented()
+
+
+@register_initializer("empty_conf")
+def EmptyInitializerImpl(
+    initializer_conf: initializer_conf_util.EmptyInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    return None
diff --git a/python/oneflow/compatible/single_client/ops/layers.py b/python/oneflow/compatible/single_client/ops/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f64355013b6e7f7ad79cbd2282b85e7c29c16d5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/layers.py
@@ -0,0 +1,1482 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Callable, Optional, Sequence, Tuple, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.job import regularizer_conf_pb2 as regularizer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+IntPair = Tuple[int, int]
+
+
+def dense(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    units: int,
+    activation: Optional[
+        Callable[
+            [oneflow._oneflow_internal.BlobDesc, str],
+            oneflow._oneflow_internal.BlobDesc,
+        ]
+    ] = None,
+    use_bias: bool = True,
+    kernel_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    bias_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    kernel_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    bias_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    trainable: bool = True,
+    name: str = "Dense",
+    model_distribute: oneflow._oneflow_internal.distribute.Distribute = oneflow._oneflow_internal.distribute.broadcast(),
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Fully-connected layer.
+
+    The fully-connected layer multiplies input Blob with weight matrix and produces an Output Blob.
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): A 2D input `Blob`.
+        units (int): A positive integer for the dimensionality of the output space.
+        activation (Optional[oneflow._oneflow_internal.BlobDesc], optional):  Activation function. Defaults to None.
+        use_bias (bool, optional): A boolean specifies whether to use a bias vector. Defaults to True.
+        kernel_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the kernel weights matrix. Defaults to None.
+        bias_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the bias vector. Defaults to None.
+        kernel_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer function applied to the kernel weights matrix. Defaults to None.
+        bias_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for the bias vector. Defaults to None.
+        trainable (bool, optional): A boolean specifies whether to train the variables. Defaults to True.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+        model_distribute (oneflow._oneflow_internal.distribute.Distribute, optional): Define the way to ditribute the model. Defaults to oneflow._oneflow_internal.distribute.broadcast().
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A N-D `Blob` with the shape of (batch_size, units).
+
+    Raises:
+        ValueError: The dimension of input `Blob` must be less than 2.
+        VauleError: Model distribute must be in auto, broadcast, split.
+        ValueError: The input must be a 2D `Blob` when the model distribute is split.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def dense_Job(x: tp.Numpy.Placeholder((1, 256))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            hidden = flow.layers.dense(
+                x,
+                512,
+                activation=flow.nn.relu,
+                kernel_initializer=initializer,
+                name="dense1",
+            )
+            return hidden
+
+
+        x = np.random.randn(1, 256).astype(np.float32)
+        out = dense_Job(x)
+
+        # out.shape (1, 512)
+
+    """
+    in_shape = inputs.shape
+    in_num_axes = len(in_shape)
+    assert in_num_axes >= 2
+    assert (
+        model_distribute is oneflow._oneflow_internal.distribute.auto()
+        or model_distribute is oneflow._oneflow_internal.distribute.broadcast()
+        or model_distribute is oneflow._oneflow_internal.distribute.split(0)
+    )
+    if model_distribute is oneflow._oneflow_internal.distribute.split(0):
+        assert in_num_axes == 2
+    if in_num_axes > 2:
+        inputs = flow.reshape(inputs, (-1, in_shape[-1]))
+    with flow.scope.namespace(name):
+        if kernel_initializer is None:
+            kernel_initializer = flow.constant_initializer(0)
+        weight = flow.get_variable(
+            name="weight",
+            shape=(units, inputs.shape[1]),
+            dtype=inputs.dtype,
+            initializer=kernel_initializer,
+            regularizer=kernel_regularizer,
+            trainable=trainable,
+            model_name="weight",
+            distribute=model_distribute,
+            reuse=False,
+        )
+        weight = weight.with_distribute(model_distribute)
+        out = flow.matmul(a=inputs, b=weight, transpose_b=True, name="matmul")
+        if use_bias:
+            if bias_initializer is None:
+                bias_initializer = flow.constant_initializer(0)
+            bias = flow.get_variable(
+                name="bias",
+                shape=(units,),
+                dtype=inputs.dtype,
+                initializer=bias_initializer,
+                regularizer=bias_regularizer,
+                trainable=trainable,
+                model_name="bias",
+                distribute=model_distribute,
+                reuse=False,
+            )
+            bias = bias.with_distribute(model_distribute)
+            out = flow.nn.bias_add(out, bias, name="bias_add")
+        if callable(activation):
+            out = activation(out, name="activation")
+    if in_num_axes > 2:
+        out = flow.reshape(out, in_shape[:-1] + (units,))
+    return out
+
+
+def conv1d(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    filters: int,
+    kernel_size: Union[int, Tuple[int]] = 1,
+    strides: Union[int, Tuple[int]] = 1,
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair]] = "VALID",
+    data_format: str = "NCW",
+    dilation_rate: Optional[Union[int, Tuple[int]]] = None,
+    groups: int = 1,
+    activation: Optional[
+        Callable[
+            [oneflow._oneflow_internal.BlobDesc, str],
+            oneflow._oneflow_internal.BlobDesc,
+        ]
+    ] = None,
+    use_bias: bool = True,
+    kernel_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    bias_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    kernel_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    bias_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    trainable: bool = True,
+    name: str = "Conv1d",
+    weight_name: Optional[str] = None,
+    bias_name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """1D convolution layer.
+
+    This layer computes a 1-D convolution with 3D input Blob and filters.
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): A 3D input `Blob`.
+        filters (int): An integer specifies the dimensionality of the output space.
+        kernel_size (Union[int, List[int], Tuple[int]], optional): An integer or tuple/list specifies the height and width of the convolution window.
+                        When it is an integer, a square window is applied to the input. Defaults to 1.
+        strides (Union[int, List[int], Tuple[int]], optional): An integer or tuple/list specifies the strides of the convolution window along the height and width.
+                        When it is an integer, the same value for the all spatial dimesions is applied. Defaults to 1.
+        padding (str, Tuple[IntPair, IntPair, IntPair], optional): padding: `string` `"SAME"` or `"SAME_LOWER"` or `"SAME_UPPER"` or `"VALID" or Tuple[IntPair, IntPair, IntPair]` indicating the type of padding algorithm to use, or a list indicating the explicit paddings at the start and end of each dimension. Defaults to "VALID".
+        data_format (str, optional): A string specifies the format of the input `Blob`, one of "NCW" or "NWC" (default: "NCW"). "NCW" cooresponds to channels_first, i.e. the input `Blob` with shape (batch_size, channels, width).
+                        "NWC" cooresponds to channels_last, i.e. the input `Blob` with shape (batch_size, channels, width). Defaults to "NCW".
+        dilation_rate (Optional[Union[int, Tuple[int]]], optional): An integer or tuple/list specifies the dilation rate for the dilated convolution. When it is an integer, the same dilation rate is applied for the all dimensions. Defaults to 1.
+        groups (int, optional): A positive integer specifies number of groups for the Group conv. Defaults to 1.
+        activation (Optional[ Callable[[oneflow._oneflow_internal.BlobDesc, str], oneflow._oneflow_internal.BlobDesc] ], optional): Activation function. Defaults to None.
+        use_bias (bool, optional): A boolean specifies whether to use a bias vector. Defaults to True.
+        kernel_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the kernel weights matrix. Defaults to None.
+        bias_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the bias vector. Defaults to None.
+        kernel_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for the kernel weights matrix. Defaults to None.
+        bias_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for the bias vector . Defaults to None.
+        trainable (bool, optional): A boolean specifies whether to train variables. Defaults to True.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+
+    Raises:
+        ValueError: If the type of kernel_size is not one of integer, list, tuple.
+        ValueError: The number of groups must be positive and number of filters must be divisible by it.
+        ValueError: If data_format is not one of 'NCW', 'NWC'.
+        ValueError: If number of input channels is not divisible by number of groups or less than number of groups.
+        ValueError: Number of group must be one when data_format is 'NWC'.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A 3D `Blob` with the shape of (batch_size, filters, new_width).
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv1d_Job(x: tp.Numpy.Placeholder((1, 64, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            conv1d = flow.layers.conv1d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer,
+                name="Conv1d"
+            )
+            return conv1d
+
+
+        x = np.random.randn(1, 64, 32).astype(np.float32)
+        out = conv1d_Job(x)
+
+        # out.shape (1, 128, 32)
+
+    """
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size,)
+    else:
+        assert isinstance(kernel_size, (list, tuple))
+        assert len(kernel_size) == 1
+        kernel_size = tuple(kernel_size)
+    assert isinstance(groups, int)
+    assert groups > 0
+    assert groups <= filters
+    assert filters % groups == 0
+    if data_format.upper() == "NCW":
+        assert groups <= inputs.shape[1]
+        assert inputs.shape[1] % groups == 0
+        weight_shape = (filters, inputs.shape[1] // groups) + kernel_size
+    elif data_format.upper() == "NWC":
+        assert groups == 1
+        assert groups <= inputs.shape[2]
+        assert inputs.shape[2] % groups == 0
+        weight_shape = (filters, kernel_size[0], inputs.shape[2] // groups)
+    else:
+        raise ValueError("data_format must be in NCW or NWC")
+    if kernel_initializer is None:
+        kernel_initializer = flow.xavier_uniform_initializer(data_format=data_format)
+    if weight_name is None:
+        with flow.scope.namespace(name):
+            weight = flow.get_variable(
+                name="weight",
+                shape=weight_shape,
+                dtype=inputs.dtype,
+                initializer=kernel_initializer,
+                regularizer=kernel_regularizer,
+                trainable=trainable,
+                model_name="weight",
+                reuse=False,
+            )
+    else:
+        weight = flow.get_variable(
+            name=weight_name,
+            shape=weight_shape,
+            dtype=inputs.dtype,
+            initializer=kernel_initializer,
+            regularizer=kernel_regularizer,
+            trainable=trainable,
+            model_name="weight",
+            reuse=False,
+        )
+    output = flow.nn.conv1d(
+        inputs,
+        weight,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        groups=groups,
+        name=name,
+    )
+    if use_bias:
+        if bias_initializer is None:
+            bias_initializer = flow.constant_initializer(0)
+        if bias_name is None:
+            with flow.scope.namespace(name):
+                bias = flow.get_variable(
+                    name="bias",
+                    shape=(filters,),
+                    dtype=inputs.dtype,
+                    initializer=bias_initializer,
+                    regularizer=bias_regularizer,
+                    trainable=trainable,
+                    model_name="bias",
+                    reuse=False,
+                )
+        else:
+            bias = flow.get_variable(
+                name=bias_name,
+                shape=(filters,),
+                dtype=inputs.dtype,
+                initializer=bias_initializer,
+                regularizer=bias_regularizer,
+                trainable=trainable,
+                model_name="bias",
+                reuse=False,
+            )
+        with flow.scope.namespace(name):
+            output = flow.nn.bias_add(output, bias, data_format, name="bias_add")
+    if callable(activation):
+        with flow.scope.namespace(name):
+            output = activation(output, name="activation")
+    return output
+
+
+def conv2d(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    filters: int,
+    kernel_size: Union[int, IntPair] = 1,
+    strides: Union[int, IntPair] = 1,
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair, IntPair]] = "VALID",
+    data_format: str = "NCHW",
+    dilation_rate: Optional[Union[int, IntPair]] = None,
+    groups: int = 1,
+    activation: Optional[
+        Callable[
+            [oneflow._oneflow_internal.BlobDesc, str],
+            oneflow._oneflow_internal.BlobDesc,
+        ]
+    ] = None,
+    use_bias: bool = True,
+    kernel_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    bias_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    kernel_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    bias_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    trainable: bool = True,
+    name: str = "Conv2d",
+    weight_name: Optional[str] = None,
+    bias_name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """2D convolution layer.
+
+    This layer computes a 2D convolution with 4D input Blob and filters.
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): A 4D input `Blob`.
+        filters (int): An integer specifies the dimensionality of the output space.
+        kernel_size (Union[int, List[int], Tuple[int]], optional): An integer or tuple/list specifies the height and width of the convolution window.
+                        When it is an integer, a square window is applied to the input. Defaults to 1.
+        strides (Union[int, List[int], Tuple[int]], optional): An integer or tuple/list specifies the strides of the convolution window along the height and width.
+                        When it is an integer, the same value for the all spatial dimesions is applied. Defaults to 1.
+        padding (str, Tuple[IntPair, IntPair, IntPair, IntPair], optional): padding: `string` `"SAME"` or `"SAME_LOWER"` or `"SAME_UPPER"` or `"VALID" or Tuple[IntPair, IntPair, IntPair]` indicating the type of padding algorithm to use, or a list indicating the explicit paddings at the start and end of each dimension. Defaults to "VALID".
+        data_format (str, optional): A string specifies the format of the input `Blob`, one of "NCHW" or "NHWC" (default: "NCHW"). "NCHW" cooresponds to channels_first, i.e. the input `Blob` with shape (batch_size, channels, height, width).
+                        "NHWC" cooresponds to channels_last, i.e. the input `Blob` with shape (batch_size, height, width, channels). Defaults to "NCHW".
+        dilation_rate (int, optional): An integer or tuple/list specifies the dilation rate for the dilated convolution. When it is an integer, the same dilation rate is applied for the all dimensions. Defaults to 1.
+        groups (int, optional): A positive integer specifies number of groups for the Group conv. Defaults to 1.
+        activation (Optional[ Callable[[oneflow._oneflow_internal.BlobDesc, str], oneflow._oneflow_internal.BlobDesc] ], optional): Activation function. Defaults to None.
+        use_bias (bool, optional): A boolean specifies whether to use a bias vector. Defaults to True.
+        kernel_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the kernel weights matrix. Defaults to None.
+        bias_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the bias vector. Defaults to None.
+        kernel_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for the kernel weights matrix. Defaults to None.
+        bias_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for the bias vector . Defaults to None.
+        trainable (bool, optional): A boolean specifies whether to train variables. Defaults to True.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+        weight_name (Optional[str], optional): This weight's name. Defaults to None.
+        bias_name (Optional[str], optional):  This bias's name. Defaults to None.
+
+    Raises:
+        ValueError: If the type of kernel_size is not one of integer, list, tuple.
+        ValueError: The number of groups must be positive and number of filters must be divisible by it.
+        ValueError: If data_format is not one of 'NCHW', 'NHWC'.
+        ValueError: If number of input channels is not divisible by number of groups or less than number of groups.
+        ValueError: Number of group must be one when data_format is 'NHWC'.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A 4D `Blob` with the shape of (batch_size, filters, new_height, new_width).
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer,
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+    else:
+        assert isinstance(kernel_size, (list, tuple))
+        assert len(kernel_size) == 2
+        kernel_size = tuple(kernel_size)
+    assert isinstance(groups, int)
+    assert groups > 0
+    assert groups <= filters
+    assert filters % groups == 0
+    if data_format.upper() == "NCHW":
+        assert groups <= inputs.shape[1]
+        assert inputs.shape[1] % groups == 0
+        weight_shape = (filters, inputs.shape[1] // groups) + kernel_size
+    elif data_format.upper() == "NHWC":
+        assert groups == 1
+        assert groups <= inputs.shape[3]
+        assert inputs.shape[3] % groups == 0
+        weight_shape = (
+            filters,
+            kernel_size[0],
+            kernel_size[1],
+            inputs.shape[3] // groups,
+        )
+    else:
+        raise ValueError("data_format must be in NCHW or NHWC")
+    if kernel_initializer is None:
+        kernel_initializer = flow.xavier_uniform_initializer(data_format=data_format)
+    if weight_name is None:
+        with flow.scope.namespace(name):
+            weight = flow.get_variable(
+                name="weight",
+                shape=weight_shape,
+                dtype=inputs.dtype,
+                initializer=kernel_initializer,
+                regularizer=kernel_regularizer,
+                trainable=trainable,
+                model_name="weight",
+                reuse=False,
+            )
+    else:
+        weight = flow.get_variable(
+            name=weight_name,
+            shape=weight_shape,
+            dtype=inputs.dtype,
+            initializer=kernel_initializer,
+            regularizer=kernel_regularizer,
+            trainable=trainable,
+            model_name="weight",
+            reuse=False,
+        )
+    output = flow.nn.conv2d(
+        inputs,
+        weight,
+        strides=strides,
+        padding=padding,
+        bias=None,
+        data_format=data_format,
+        dilations=dilation_rate,
+        groups=groups,
+        name=name,
+    )
+    if use_bias:
+        if bias_initializer is None:
+            bias_initializer = flow.constant_initializer(0)
+        if bias_name is None:
+            with flow.scope.namespace(name):
+                bias = flow.get_variable(
+                    name="bias",
+                    shape=(filters,),
+                    dtype=inputs.dtype,
+                    initializer=bias_initializer,
+                    regularizer=bias_regularizer,
+                    trainable=trainable,
+                    model_name="bias",
+                    reuse=False,
+                )
+        else:
+            bias = flow.get_variable(
+                name=bias_name,
+                shape=(filters,),
+                dtype=inputs.dtype,
+                initializer=bias_initializer,
+                regularizer=bias_regularizer,
+                trainable=trainable,
+                model_name="bias",
+                reuse=False,
+            )
+        with flow.scope.namespace(name):
+            output = flow.nn.bias_add(output, bias, data_format, name="bias_add")
+    if callable(activation):
+        with flow.scope.namespace(name):
+            output = activation(output, name="activation")
+    return output
+
+
+def conv3d(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    filters: int,
+    kernel_size: Union[int, Sequence[int]] = 1,
+    strides: Union[int, Sequence[int]] = 1,
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair, IntPair, IntPair]] = "VALID",
+    data_format: str = "NCDHW",
+    dilation_rate: Optional[Union[int, IntPair]] = None,
+    groups: int = 1,
+    activation: Optional[
+        Callable[
+            [oneflow._oneflow_internal.BlobDesc, str],
+            oneflow._oneflow_internal.BlobDesc,
+        ]
+    ] = None,
+    use_bias: bool = True,
+    kernel_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    bias_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    kernel_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    bias_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    trainable: bool = True,
+    name: str = "Conv3d",
+    weight_name: Optional[str] = None,
+    bias_name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """3D convolution layer.
+
+    This layer computes 3D convolution with 5D input Blob and filters
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): A 5D input `Blob`.
+        filters (int): An integer specifies the dimensionality of the output space.
+        kernel_size (Union[int, List[int], Sequence[int]], optional): An integer or tuple/list specifies the height and width of the convolution window.
+                        When it is an integer, a square window is applied to the input. Defaults to 1.
+        strides (Union[int, List[int], Sequence[int]], optional): An integer or tuple/list specifies the strides of the convolution window along the height and width.
+                        When it is an integer, the same value for the all spatial dimesions is applied. Defaults to 1.
+        padding (str, Tuple[IntPair, IntPair, IntPair, IntPair, IntPair], optional): padding: `string` `"SAME"` or `"SAME_LOWER"` or `"SAME_UPPER"` or `"VALID" or Tuple[IntPair, IntPair, IntPair, IntPair, IntPair]` indicating the type of padding algorithm to use, or a list indicating the explicit paddings at the start and end of each dimension. Defaults to "VALID".
+        data_format (str, optional): A string specifies the format of the input `Blob`, one of "NCDHW" or "NDHWC" (default: "NCDHW"). "NCDHW" cooresponds to channels_first, i.e. the input `Blob` with shape (batch_size, channels, depth, height, width).
+                        "NDHWC" cooresponds to channels_last, i.e. the input `Blob` with shape (batch_size, channels, depth, height, width). Defaults to "NCDHW".
+        dilation_rate (int, optional): An integer or tuple/list specifies the dilation rate for the dilated convolution. When it is an integer, the same dilation rate is applied for the all dimensions. Defaults to 1.
+        groups (int, optional): A positive integer specifies number of groups for the Group conv. Defaults to 1.
+        activation (Optional[ Callable[[oneflow._oneflow_internal.BlobDesc, str], oneflow._oneflow_internal.BlobDesc] ], optional): Activation function. Defaults to None.
+        use_bias (bool, optional): A boolean specifies whether to use a bias vector. Defaults to True.
+        kernel_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the kernel weights matrix. Defaults to None.
+        bias_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for the bias vector. Defaults to None.
+        kernel_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for the kernel weights matrix. Defaults to None.
+        bias_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for the bias vector . Defaults to None.
+        trainable (bool, optional): A boolean specifies whether to train variables. Defaults to True.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+        weight_name (Optional[str], optional): This weight's name. Defaults to None.
+        bias_name (Optional[str], optional):  This bias's name. Defaults to None.
+
+    Raises:
+        ValueError: If the type of kernel_size is not one of integer, list, tuple.
+        ValueError: The number of groups must be positive and number of filters must be divisible by it.
+        ValueError: If data_format is not one of 'NCDHW', 'NDHWC'.
+        ValueError: If number of input channels is not divisible by number of groups or less than number of groups.
+        ValueError: Number of group must be one when data_format is 'NDHWC'.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A 5D `Blob` with the shape of (batch_size, filters, new_height, new_width).
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv3d_Job(x: tp.Numpy.Placeholder((1, 64, 16, 16, 16))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            conv3d = flow.layers.conv3d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer,
+                name="Conv3d"
+            )
+            return conv3d
+
+
+        x = np.random.randn(1, 64, 16, 16, 16).astype(np.float32)
+        out = conv3d_Job(x)
+
+        # out.shape (1, 128, 16, 16, 16)
+
+    """
+    need_transpose = 0
+    if data_format.upper() == "NDHWC":
+        need_transpose = 1
+        data_format = "NCDHW"
+    if need_transpose:
+        inputs = flow.transpose(inputs, perm=[0, 4, 1, 2, 3])
+        if isinstance(padding, (list, tuple)):
+            padding = list(padding)
+            (padding[1], padding[4]) = (padding[4], padding[1])
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size, kernel_size)
+    else:
+        assert isinstance(kernel_size, (list, tuple))
+        assert len(kernel_size) == 3
+        kernel_size = tuple(kernel_size)
+    assert isinstance(groups, int)
+    assert groups > 0
+    assert groups <= filters
+    assert filters % groups == 0
+    if data_format.upper() == "NCDHW":
+        assert groups <= inputs.shape[1]
+        assert inputs.shape[1] % groups == 0
+        weight_shape = (filters, inputs.shape[1] // groups) + kernel_size
+    elif data_format.upper() == "NDHWC":
+        assert groups == 1
+        assert groups <= inputs.shape[3]
+        assert inputs.shape[3] % groups == 0
+        weight_shape = (
+            filters,
+            kernel_size[0],
+            kernel_size[1],
+            kernel_size[2],
+            inputs.shape[4] // groups,
+        )
+    else:
+        raise ValueError("data_format must be in NCHW or NHWC")
+    if kernel_initializer is None:
+        kernel_initializer = flow.xavier_uniform_initializer(data_format=data_format)
+    if weight_name is None:
+        with flow.scope.namespace(name):
+            weight = flow.get_variable(
+                name="weight",
+                shape=weight_shape,
+                dtype=inputs.dtype,
+                initializer=kernel_initializer,
+                regularizer=kernel_regularizer,
+                trainable=trainable,
+                model_name="weight",
+                reuse=False,
+            )
+    else:
+        weight = flow.get_variable(
+            name=weight_name,
+            shape=weight_shape,
+            dtype=inputs.dtype,
+            initializer=kernel_initializer,
+            regularizer=kernel_regularizer,
+            trainable=trainable,
+            model_name="weight",
+            reuse=False,
+        )
+    output = flow.nn.conv3d(
+        inputs,
+        weight,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        groups=groups,
+        name=name,
+    )
+    if use_bias:
+        if bias_initializer is None:
+            bias_initializer = flow.constant_initializer(0)
+        if bias_name is None:
+            with flow.scope.namespace(name):
+                bias = flow.get_variable(
+                    name="bias",
+                    shape=(filters,),
+                    dtype=inputs.dtype,
+                    initializer=bias_initializer,
+                    regularizer=bias_regularizer,
+                    trainable=trainable,
+                    model_name="bias",
+                    reuse=False,
+                )
+        else:
+            bias = flow.get_variable(
+                name=bias_name,
+                shape=(filters,),
+                dtype=inputs.dtype,
+                initializer=bias_initializer,
+                regularizer=bias_regularizer,
+                trainable=trainable,
+                model_name="bias",
+                reuse=False,
+            )
+        with flow.scope.namespace(name):
+            output = flow.nn.bias_add(output, bias, data_format, name="bias_add")
+    if callable(activation):
+        with flow.scope.namespace(name):
+            output = activation(output, name="activation")
+    if need_transpose:
+        output = flow.transpose(output, perm=[0, 2, 3, 4, 1])
+    return output
+
+
+def layer_norm(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    center: bool = True,
+    scale: bool = True,
+    trainable: bool = True,
+    begin_norm_axis: int = 1,
+    begin_params_axis: int = -1,
+    epsilon: float = 1e-05,
+    name: str = "LayerNorm",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Layer Normalization.
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+        center (bool, optional): A boolean specifies whether to shift input `Blob`. Defaults to True.
+        scale (bool, optional): A boolean specifies whether to scale input `Blob`. Defaults to True.
+        trainable (bool, optional): A boolean specifies whether to train variables. Defaults to True.
+        begin_norm_axis (int, optional): An integer specifies which axis to normalize at first. Defaults to 1.
+        begin_params_axis (int, optional):  An integer specifies which axis params at . Defaults to -1.
+        epsilon (float, optional): A small float is added to avoid division by zero. Defaults to 1e-5.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A normalized `Blob` with same shape of input.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def layer_norm_Job(x: tp.Numpy.Placeholder((1, 64, 128, 128))
+        ) -> tp.Numpy:
+            layer_norm = flow.layers.layer_norm(
+                x,
+                name="LayerNorm1"
+            )
+            return layer_norm
+
+
+        x = np.random.randn(1, 64, 128, 128).astype(np.float32)
+        out = layer_norm_Job(x)
+
+        # out.shape (1, 64, 128, 128)
+
+    """
+    if center is False and scale is False:
+        trainable = False
+    beta = None
+    gamma = None
+    param_shape = inputs.shape[begin_params_axis:]
+    if center:
+        with flow.scope.namespace(name):
+            beta = flow.get_variable(
+                name="beta",
+                shape=param_shape,
+                dtype=inputs.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                model_name="beta",
+                distribute=oneflow._oneflow_internal.distribute.broadcast(),
+                reuse=False,
+            )
+    if scale:
+        with flow.scope.namespace(name):
+            gamma = flow.get_variable(
+                name="gamma",
+                shape=param_shape,
+                dtype=inputs.dtype,
+                initializer=flow.constant_initializer(1.0),
+                trainable=trainable,
+                model_name="gamma",
+                distribute=oneflow._oneflow_internal.distribute.broadcast(),
+                reuse=False,
+            )
+    if flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu":
+        if begin_norm_axis < 0:
+            begin_norm_axis = begin_norm_axis + len(inputs.shape)
+        reduce_axis = []
+        for dim in range(len(inputs.shape)):
+            if dim >= begin_norm_axis:
+                reduce_axis.append(dim)
+        (mean, variance) = flow.nn.moments(inputs, reduce_axis, keepdims=True)
+        axis = begin_norm_axis
+        normalized = flow.nn.batch_normalization(
+            x=inputs,
+            mean=mean,
+            variance=variance,
+            variance_epsilon=epsilon,
+            axis=axis,
+            name=name,
+        )
+        nd_params_shape = [1] * (len(inputs.shape) - len(param_shape)) + list(
+            param_shape
+        )
+        affined = normalized
+        if gamma:
+            gamma = flow.reshape(gamma, nd_params_shape)
+            affined *= gamma
+        if beta:
+            beta = flow.reshape(beta, nd_params_shape)
+            affined += beta
+        return affined
+    elif flow.current_scope().device_parallel_desc_symbol.device_tag == "gpu":
+        op_builder = (
+            flow.user_op_builder(name)
+            .Op("layer_norm")
+            .Input("x", [inputs])
+            .Output("y")
+            .Output("mean")
+            .Output("inv_variance")
+        )
+        if beta is not None:
+            op_builder.Input("beta", [beta])
+        if gamma is not None:
+            op_builder.Input("gamma", [gamma])
+            op_builder.Output("normalized")
+        op_builder.Attr("center", center)
+        op_builder.Attr("scale", scale)
+        op_builder.Attr("begin_norm_axis", begin_norm_axis)
+        op_builder.Attr("begin_params_axis", begin_params_axis)
+        op_builder.Attr("epsilon", epsilon)
+        return op_builder.Build().InferAndTryRun().RemoteBlobList()[0]
+    else:
+        raise NotImplementedError
+
+
+def layer_norm_grad(
+    dy: oneflow._oneflow_internal.BlobDesc,
+    x: oneflow._oneflow_internal.BlobDesc,
+    mean: oneflow._oneflow_internal.BlobDesc,
+    inv_variance: oneflow._oneflow_internal.BlobDesc,
+    begin_norm_axis: int = 1,
+    name: str = "LayerNormGrad",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Layer normalization
+
+    Args:
+        dy (oneflow._oneflow_internal.BlobDesc): Upstream derivstives.
+        x (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+        mean (oneflow._oneflow_internal.BlobDesc): Mean over neurons.
+        inv_variance (oneflow._oneflow_internal.BlobDesc): Variance over neurons.
+        begin_norm_axis (int, optional): An integer specifies which axis to normalize at first. Defaults to 1.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: Gradient with respect to input `Blob`.
+    """
+    op = (
+        flow.user_op_builder(name)
+        .Op("layer_norm_grad")
+        .Input("dy", [dy])
+        .Input("x", [x])
+        .Input("mean", [mean])
+        .Input("inv_variance", [inv_variance])
+        .Output("dx")
+        .Attr("begin_norm_axis", begin_norm_axis)
+        .Attr("epsilon", 1e-05)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def layer_norm_param_grad(
+    dy: oneflow._oneflow_internal.BlobDesc,
+    norm: oneflow._oneflow_internal.BlobDesc,
+    gamma: oneflow._oneflow_internal.BlobDesc,
+    begin_params_axis: int = -1,
+    name: str = "LayerNormParamGrad",
+) -> Tuple[
+    oneflow._oneflow_internal.BlobDesc,
+    oneflow._oneflow_internal.BlobDesc,
+    oneflow._oneflow_internal.BlobDesc,
+]:
+    """Backward pass for layer normalization
+
+    Args:
+        dy (oneflow._oneflow_internal.BlobDesc): Upstream derivstives.
+        norm (oneflow._oneflow_internal.BlobDesc): Normalized output.
+        gamma (oneflow._oneflow_internal.BlobDesc): Scale parameter.
+        begin_params_axis (int, optional): From which parameters to begin with. Defaults to -1.
+        name (Optional[str], optional): This layer's name. Defaults to 'LayerNormParamGrad'.
+
+    Returns:
+        Tuple[oneflow._oneflow_internal.BlobDesc]:
+                normalized_diff: Gradient with respect to input `Blob`.
+                beta_diff: Gradient with respect to shift parameter beta.
+                gamma_diff: Gradient with respect to scale parameter gamma.
+    """
+    op = (
+        flow.user_op_builder(name)
+        .Op("layer_norm_param_grad")
+        .Input("dy", [dy])
+        .Input("normalized", [norm])
+        .Input("gamma", [gamma])
+        .Output("normalized_diff")
+        .Output("beta_diff")
+        .Output("gamma_diff")
+        .Output("reduce_buf")
+        .Attr("begin_params_axis", begin_params_axis)
+        .Build()
+    )
+    (
+        normalized_diff,
+        beta_diff,
+        gamma_diff,
+        reduce_buf,
+    ) = op.InferAndTryRun().RemoteBlobList()
+    return (normalized_diff, beta_diff, gamma_diff)
+
+
+def _get_batch_normalization_variables(
+    name,
+    gamma_name,
+    beta_name,
+    moving_mean_name,
+    moving_variance_name,
+    center,
+    scale,
+    params_shape,
+    params_dtype,
+    trainable,
+    beta_initializer,
+    beta_regularizer,
+    gamma_initializer,
+    gamma_regularizer,
+    moving_mean_initializer,
+    moving_variance_initializer,
+):
+    def get_beta_var(name):
+        if center:
+            beta = flow.get_variable(
+                name=name,
+                shape=params_shape,
+                dtype=params_dtype,
+                initializer=beta_initializer or flow.zeros_initializer(),
+                regularizer=beta_regularizer,
+                trainable=trainable,
+                distribute=oneflow._oneflow_internal.distribute.broadcast(),
+                reuse=False,
+            )
+        else:
+            beta = flow.constant(0, dtype=params_dtype, shape=params_shape, name=name)
+        return beta
+
+    if beta_name is None:
+        with flow.scope.namespace(name):
+            beta = get_beta_var("beta")
+    else:
+        beta = get_beta_var(beta_name)
+
+    def get_gamma_var(name):
+        if scale:
+            gamma = flow.get_variable(
+                name=name,
+                shape=params_shape,
+                dtype=params_dtype,
+                initializer=gamma_initializer or flow.ones_initializer(),
+                regularizer=gamma_regularizer,
+                trainable=trainable,
+                distribute=oneflow._oneflow_internal.distribute.broadcast(),
+                reuse=False,
+            )
+        else:
+            gamma = flow.constant(1, dtype=params_dtype, shape=params_shape, name=name)
+        return gamma
+
+    if gamma_name is None:
+        with flow.scope.namespace(name):
+            gamma = get_gamma_var("gamma")
+    else:
+        gamma = get_gamma_var(gamma_name)
+
+    def get_moving_mean_var(name):
+        moving_mean = flow.get_variable(
+            name=name,
+            shape=params_shape,
+            dtype=params_dtype,
+            initializer=moving_mean_initializer or flow.zeros_initializer(),
+            trainable=False,
+            distribute=oneflow._oneflow_internal.distribute.broadcast(),
+            reuse=False,
+        )
+        return moving_mean
+
+    if moving_mean_name is None:
+        with flow.scope.namespace(name):
+            moving_mean = get_moving_mean_var("moving_mean")
+    else:
+        moving_mean = get_moving_mean_var(moving_mean_name)
+
+    def get_moving_variance_var(name):
+        moving_variance = flow.get_variable(
+            name=name,
+            shape=params_shape,
+            dtype=params_dtype,
+            initializer=moving_variance_initializer or flow.ones_initializer(),
+            trainable=False,
+            distribute=oneflow._oneflow_internal.distribute.broadcast(),
+            reuse=False,
+        )
+        return moving_variance
+
+    if moving_variance_name is None:
+        with flow.scope.namespace(name):
+            moving_variance = get_moving_variance_var("moving_variance")
+    else:
+        moving_variance = get_moving_variance_var(moving_variance_name)
+    return (beta, gamma, moving_mean, moving_variance)
+
+
+def batch_normalization(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    axis: int = -1,
+    momentum: float = 0.99,
+    epsilon: float = 0.001,
+    center: bool = True,
+    scale: bool = True,
+    beta_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    gamma_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    beta_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    gamma_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    moving_mean_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    moving_variance_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    trainable: bool = True,
+    training: bool = True,
+    name: str = "BatchNorm",
+    gamma_name: Optional[str] = None,
+    beta_name: Optional[str] = None,
+    moving_mean_name: Optional[str] = None,
+    moving_variance_name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The BatchNormalization Layer.
+
+    This layer can be used in conv or dense layer.
+
+    The input data will be normalized by the mean and variance of the current batch data
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+        axis (int, optional): An int specifies the axis that should be normalized . Default is -1, which normalizes the last axis.
+        momentum (float, optional):  A float specifies the momentum for the moving average. Defaults to 0.99.
+        epsilon (float, optional): A small float added to avoid division by zero. Defaults to 0.001.
+        center (bool, optional): A boolean specifies whether to add offset to normalized `Blob`. Defaults to True.
+        scale (bool, optional): A boolean specifies whether to multiply normalized `Blob` by gamma. Defaults to True.
+        beta_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for beta. Defaults to None.
+        gamma_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for gamma. Defaults to None.
+        beta_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for beta. Defaults to None.
+        gamma_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for gamma. Defaults to None.
+        moving_mean_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for moving mean. Defaults to None.
+        moving_variance_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for moving variance. Defaults to None.
+        trainable (bool, optional): A boolean specifies whether to train variables. Defaults to True.
+        training (bool, optional): A boolean specifies whether now is training the model. Defaults to True.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+        gamma_name (Optional[str], optional): This gamma's name. Defaults to None.
+        beta_name (Optional[str], optional): This beta's name. Defaults to None.
+        moving_mean_name (Optional[str], optional): This moving_mean's name. Defaults to None.
+        moving_variance_name (Optional[str], optional): This moving_var's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` with same shape of input.
+
+    Raises:
+        ValueError: If axis is out of dimension of input.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def batch_norm_Job(x: tp.Numpy.Placeholder((1, 64, 128, 128))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=2,
+                padding='SAME',
+                kernel_initializer=initializer,
+                name="Conv2d"
+            )
+            batch_norm = flow.layers.batch_normalization(
+                conv2d,
+                axis=1
+            )
+            return batch_norm
+
+
+        x = np.random.randn(1, 64, 128, 128).astype(np.float32)
+        out = batch_norm_Job(x)
+
+        # out.shape (1, 128, 64, 64)
+
+    """
+    if axis < 0:
+        axis += len(inputs.shape)
+    assert axis >= 0 and axis < len(inputs.shape)
+    params_shape = [inputs.shape[axis]]
+    params_dtype = flow.float32 if inputs.dtype == flow.float16 else inputs.dtype
+    if not flow.current_global_function_desc().IsTrainable() or not trainable:
+        training = False
+    (beta, gamma, moving_mean, moving_variance) = _get_batch_normalization_variables(
+        name,
+        gamma_name,
+        beta_name,
+        moving_mean_name,
+        moving_variance_name,
+        center,
+        scale,
+        params_shape,
+        params_dtype,
+        trainable,
+        beta_initializer,
+        beta_regularizer,
+        gamma_initializer,
+        gamma_regularizer,
+        moving_mean_initializer,
+        moving_variance_initializer,
+    )
+    if flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu":
+        if training:
+            reduce_axis = []
+            for dim in range(len(inputs.shape)):
+                if dim != axis:
+                    reduce_axis.append(dim)
+            (mean, variance) = flow.nn.moments(inputs, reduce_axis, keepdims=False)
+
+            def update_moving(moving, this_batch):
+                moving_identity = flow.identity(moving)
+                flow.assign(
+                    moving, momentum * moving_identity + (1 - momentum) * this_batch
+                )
+
+            update_moving(moving_mean, mean)
+            update_moving(moving_variance, variance)
+            return flow.nn.batch_normalization(
+                x=inputs,
+                mean=mean,
+                variance=variance,
+                offset=beta,
+                scale=gamma,
+                variance_epsilon=epsilon,
+                axis=axis,
+                name=name,
+            )
+        else:
+            mean = moving_mean
+            variance = moving_variance
+            return flow.nn.batch_normalization(
+                x=inputs,
+                mean=mean,
+                variance=variance,
+                offset=beta,
+                scale=gamma,
+                variance_epsilon=epsilon,
+                axis=axis,
+                name=name,
+            )
+    else:
+        builder = (
+            flow.user_op_builder(name)
+            .Op("normalization")
+            .Input("x", [inputs])
+            .Input("moving_mean", [moving_mean])
+            .Input("moving_variance", [moving_variance])
+            .Input("gamma", [gamma])
+            .Input("beta", [beta])
+            .Output("y")
+            .Attr("axis", axis)
+            .Attr("epsilon", epsilon)
+            .Attr("training", training)
+            .Attr("momentum", momentum)
+        )
+        if trainable and training:
+            builder = builder.Output("mean").Output("inv_variance")
+        return builder.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def batch_normalization_add_relu(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    addend: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    axis: int = -1,
+    momentum: float = 0.99,
+    epsilon: float = 0.001,
+    center: bool = True,
+    scale: bool = True,
+    beta_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    gamma_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    beta_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    gamma_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    moving_mean_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    moving_variance_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    trainable: bool = True,
+    training: bool = True,
+    name: str = "BatchNorm",
+    gamma_name: Optional[str] = None,
+    beta_name: Optional[str] = None,
+    moving_mean_name: Optional[str] = None,
+    moving_variance_name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Fused flow.layers.batch_normalization + flow.math.add + flow.math.relu
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+        addend (oneflow._oneflow_internal.BlobDesc): `Blob` add to batch_normalization output.
+        axis (int, optional): An int specifies the axis that should be normalized . Default is -1, which normalizes the last axis.
+        momentum (float, optional):  A float specifies the momentum for the moving average. Defaults to 0.99.
+        epsilon (float, optional): A small float added to avoid division by zero. Defaults to 0.001.
+        center (bool, optional): A boolean specifies whether to add offset to normalized `Blob`. Defaults to True.
+        scale (bool, optional): A boolean specifies whether to multiply normalized `Blob` by gamma. Defaults to True.
+        beta_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for beta. Defaults to None.
+        gamma_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for gamma. Defaults to None.
+        beta_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for beta. Defaults to None.
+        gamma_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for gamma. Defaults to None.
+        moving_mean_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for moving mean. Defaults to None.
+        moving_variance_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for moving variance. Defaults to None.
+        trainable (bool, optional): A boolean specifies whether to train variables. Defaults to True.
+        training (bool, optional): A boolean specifies whether now is training the model. Defaults to True.
+        name (Optional[str], optional): This layer's name. Defaults to None.
+        gamma_name (Optional[str], optional): This gamma's name. Defaults to None.
+        beta_name (Optional[str], optional): This beta's name. Defaults to None.
+        moving_mean_name (Optional[str], optional): This moving_mean's name. Defaults to None.
+        moving_variance_name (Optional[str], optional): This moving_var's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` with same shape of input.
+
+    Raises:
+        ValueError: If axis is out of dimension of input.
+
+    """
+    if not flow.current_global_function_desc().IsTrainable() or not trainable:
+        training = False
+    if (
+        not training
+        or flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu"
+    ):
+        out = flow.layers.batch_normalization(
+            inputs,
+            axis=axis,
+            momentum=momentum,
+            epsilon=epsilon,
+            center=center,
+            scale=scale,
+            beta_initializer=beta_initializer,
+            gamma_initializer=gamma_initializer,
+            beta_regularizer=beta_regularizer,
+            gamma_regularizer=gamma_regularizer,
+            moving_mean_initializer=moving_mean_initializer,
+            moving_variance_initializer=moving_variance_initializer,
+            trainable=trainable,
+            training=training,
+            name=name,
+        )
+        with flow.scope.namespace("BatchNormAddRelu"):
+            if addend is not None:
+                out = out + addend
+            return flow.math.relu(out)
+    if axis < 0:
+        axis += len(inputs.shape)
+    assert 0 <= axis < len(inputs.shape)
+    params_shape = [inputs.shape[axis]]
+    params_dtype = flow.float32 if inputs.dtype == flow.float16 else inputs.dtype
+    (beta, gamma, moving_mean, moving_variance) = _get_batch_normalization_variables(
+        name,
+        gamma_name,
+        beta_name,
+        moving_mean_name,
+        moving_variance_name,
+        center,
+        scale,
+        params_shape,
+        params_dtype,
+        trainable,
+        beta_initializer,
+        beta_regularizer,
+        gamma_initializer,
+        gamma_regularizer,
+        moving_mean_initializer,
+        moving_variance_initializer,
+    )
+    builder = (
+        flow.user_op_builder(name)
+        .Op("normalization_add_relu")
+        .Input("x", [inputs])
+        .Input("moving_mean", [moving_mean])
+        .Input("moving_variance", [moving_variance])
+        .Input("gamma", [gamma])
+        .Input("beta", [beta])
+        .Output("y")
+        .Output("mean")
+        .Output("inv_variance")
+        .Output("reserve_space")
+        .Attr("axis", axis)
+        .Attr("epsilon", epsilon)
+        .Attr("momentum", momentum)
+    )
+    if addend is not None:
+        builder = builder.Input("addend", [addend])
+    return builder.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def batch_normalization_relu(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    axis: int = -1,
+    momentum: float = 0.99,
+    epsilon: float = 0.001,
+    center: bool = True,
+    scale: bool = True,
+    beta_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    gamma_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    beta_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    gamma_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    moving_mean_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    moving_variance_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    trainable: bool = True,
+    training: bool = True,
+    name: str = "BatchNorm",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Fused flow.layers.batch_normalization + flow.math.relu
+
+Args:
+    inputs (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+    axis (int, optional): An int specifies the axis that should be normalized . Default is -1, which normalizes the last axis.
+    momentum (float, optional):  A float specifies the momentum for the moving average. Defaults to 0.99.
+    epsilon (float, optional): A small float added to avoid division by zero. Defaults to 0.001.
+    center (bool, optional): A boolean specifies whether to add offset to normalized `Blob`. Defaults to True.
+    scale (bool, optional): A boolean specifies whether to multiply normalized `Blob` by gamma. Defaults to True.
+    beta_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for beta. Defaults to None.
+    gamma_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for gamma. Defaults to None.
+    beta_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for beta. Defaults to None.
+    gamma_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): Regularizer for gamma. Defaults to None.
+    moving_mean_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for moving mean. Defaults to None.
+    moving_variance_initializer (Optional[initializer_conf_util.InitializerConf], optional): Initializer for moving variance. Defaults to None.
+    trainable (bool, optional): A boolean specifies whether to train variables. Defaults to True.
+    training (bool, optional): A boolean specifies whether now is training the model. Defaults to True.
+    name (Optional[str], optional): This layer's name. Defaults to None.
+
+Returns:
+    oneflow._oneflow_internal.BlobDesc:  A `Blob` with same shape of input.
+
+Raises:
+    ValueError: If axis is out of dimension of input.
+
+"""
+    return flow.layers.batch_normalization_add_relu(
+        inputs,
+        axis=axis,
+        momentum=momentum,
+        epsilon=epsilon,
+        center=center,
+        scale=scale,
+        beta_initializer=beta_initializer,
+        gamma_initializer=gamma_initializer,
+        beta_regularizer=beta_regularizer,
+        gamma_regularizer=gamma_regularizer,
+        moving_mean_initializer=moving_mean_initializer,
+        moving_variance_initializer=moving_variance_initializer,
+        trainable=trainable,
+        training=training,
+        name=name,
+    )
+
+
+def upsample(
+    x: oneflow._oneflow_internal.BlobDesc,
+    size: Sequence[int] = (2, 2),
+    align_corners: bool = False,
+    data_format: str = "NCHW",
+    interpolation: str = "nearest",
+    name: str = "Upsample2D",
+):
+    """The Upsample Layer, this layer can upsample the feature map to a specified scale.
+
+    Args:
+        x ([type]): Input `Blob`.
+        size (tuple, optional): (height_scale, width_scale)  Defaults to (2, 2).
+        align_corners (bool, optional): Defaults to False.
+        data_format (str, optional): A string specifies the format of the input `Blob`, one of "NCHW" or "NHWC" (default: "NCHW"). "NCHW" cooresponds to channels_first, i.e. the input `Blob` with shape (batch_size, channels, height, width).
+                        "NHWC" cooresponds to channels_last, i.e. the input `Blob` with shape (batch_size, height, width, channels).. Defaults to "NCHW".
+        interpolation (str, optional): Image interpolation algorithm to enlarge the image size. Defaults to "nearest". "nearest" and "bilinear" are available now.
+        name ([type], optional): This layer's name. Defaults to None.
+
+    Raises:
+        ValueError: interpolation must be "nearest" or "bilinear".
+        ValueError: data_format must be "NHWC" or "NCHW"
+
+    Returns:
+        [type]: oneflow._oneflow_internal.BlobDesc:  A `Blob` which is the upsampled `x`. If `size` is (2, 2), the shape of return value is [N, C, 2H, 2W].
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def upsample_Job(x: tp.Numpy.Placeholder((1, 32, 32, 32))
+        ) -> tp.Numpy:
+            upsample = flow.layers.upsample_2d(
+                x,
+                size=(2, 2),
+                name="Upsample1"
+            )
+            return upsample
+
+
+        x = np.random.randn(1, 32, 32, 32).astype(np.float32)
+        out = upsample_Job(x)
+
+        # out.shape (1, 32, 64, 64)
+
+    """
+    if isinstance(size, int):
+        height_scale = size
+        width_scale = size
+    else:
+        assert isinstance(size, (list, tuple))
+        assert len(size) == 2
+        height_scale = size[0]
+        width_scale = size[1]
+    if interpolation != "nearest" and interpolation != "bilinear":
+        raise ValueError('interpolation must be "nearest" or "bilinear".')
+    if interpolation == "nearest" and align_corners:
+        raise ValueError('interpolation "nearest" does not support align_corners.')
+    if data_format.upper() != "NCHW" and data_format.upper() != "NHWC":
+        raise ValueError('data_format must be "NHWC" or "NCHW".')
+    need_transpose = 0
+    if data_format.upper() == "NHWC":
+        need_transpose = 1
+    if need_transpose:
+        x = flow.transpose(x, perm=[0, 3, 1, 2])
+    op = (
+        flow.user_op_builder(name)
+        .Op("upsample")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("height_scale", float(height_scale))
+        .Attr("width_scale", float(width_scale))
+        .Attr("align_corners", align_corners)
+        .Attr("data_format", "channels_first")
+        .Attr("interpolation", interpolation)
+        .Build()
+    )
+    output = op.InferAndTryRun().SoleOutputBlob()
+    if need_transpose:
+        output = flow.transpose(output, perm=[0, 2, 3, 1])
+    return output
diff --git a/python/oneflow/compatible/single_client/ops/linalg.py b/python/oneflow/compatible/single_client/ops/linalg.py
new file mode 100644
index 0000000000000000000000000000000000000000..d702078cd08d35167f4df89c6b988f43b46de646
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/linalg.py
@@ -0,0 +1,124 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def matmul(
+    a: oneflow._oneflow_internal.BlobDesc,
+    b: oneflow._oneflow_internal.BlobDesc,
+    transpose_a: bool = False,
+    transpose_b: bool = False,
+    alpha: float = 1.0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator applies matrix multiplication to two Blobs.
+
+    Args:
+        a (oneflow._oneflow_internal.BlobDesc): A Blob
+        b (oneflow._oneflow_internal.BlobDesc): A Blob
+        transpose_a (bool, optional): Whether to transpose A Blob. Defaults to False.
+        transpose_b (bool, optional): Whether to transpose B Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def matmul_Job(A: tp.Numpy.Placeholder((3, 3)),
+                    B: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.linalg.matmul(A, B)
+
+
+        A = np.array([[1, 0, 0],
+                    [0, 1, 1],
+                    [0, 0, 1]]).astype(np.float32)
+        B = np.array([[3, 4, 5],
+                    [6, 7, 8],
+                    [9, 10, 11]]).astype(np.float32)
+        out = matmul_Job(A, B)
+
+        # output [[ 3.  4.  5.]
+        #         [15. 17. 19.]
+        #         [ 9. 10. 11.]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Matmul_")
+    assert len(a.shape) >= 2
+    assert len(b.shape) >= 2
+    if len(a.shape) == len(b.shape):
+        if len(a.shape) == 2:
+            op = (
+                flow.user_op_builder(name)
+                .Op("matmul")
+                .Input("a", [a])
+                .Input("b", [b])
+                .Output("out")
+                .Attr("transpose_a", transpose_a)
+                .Attr("transpose_b", transpose_b)
+                .Attr("alpha", float(alpha))
+                .Build()
+            )
+        else:
+            op = (
+                flow.user_op_builder(name)
+                .Op("batch_matmul")
+                .Input("a", [a])
+                .Input("b", [b])
+                .Output("out")
+                .Attr("transpose_a", transpose_a)
+                .Attr("transpose_b", transpose_b)
+                .Attr("alpha", float(alpha))
+                .Build()
+            )
+    else:
+        if len(b.shape) != 2:
+            raise ValueError(
+                "don't support number of dimensions of a being less than number of dimensions of b"
+            )
+        if transpose_a:
+            raise ValueError("don't support tensor a to be tranpose")
+        op = (
+            flow.user_op_builder(name)
+            .Op("broadcast_matmul")
+            .Input("a", [a])
+            .Input("b", [b])
+            .Output("out")
+            .Attr("transpose_a", transpose_a)
+            .Attr("transpose_b", transpose_b)
+            .Attr("alpha", float(alpha))
+            .Build()
+        )
+    return op.InferAndTryRun().SoleOutputBlob()
diff --git a/python/oneflow/compatible/single_client/ops/loss_ops.py b/python/oneflow/compatible/single_client/ops/loss_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c00ef169839e7e2539e4aff3edb97ce7324c3dc
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/loss_ops.py
@@ -0,0 +1,275 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def smooth_l1_loss(
+    prediction: oneflow._oneflow_internal.BlobDesc,
+    label: oneflow._oneflow_internal.BlobDesc,
+    beta: float = 1.0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the smooth l1 loss.
+
+    The equation is:
+
+    .. math::
+
+        & out = \\frac{(\\beta*x)^2}{2}, \\left|x\\right|<\\frac{1}{{\\beta}^2}
+
+        & out = \\left|x\\right|-\\frac{0.5}{{\\beta}^2}, otherwise
+
+
+    Args:
+        prediction (oneflow._oneflow_internal.BlobDesc): The prediction Blob
+        label (oneflow._oneflow_internal.BlobDesc): The label Blob
+        beta (float, optional): The :math:`\\beta` in the equation. Defaults to 1.0.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def smooth_l1_loss_Job(prediction: tp.Numpy.Placeholder((5, )),
+                            label: tp.Numpy.Placeholder((5, ))
+        ) -> tp.Numpy:
+            return flow.smooth_l1_loss(prediction=prediction,
+                                    label=label)
+
+
+        prediction = np.array([0.1, 0.4, 0.3, 0.5, 0.9]).astype(np.float32)
+        label = np.array([0.3, 0.9, 2.5, 0.4, 0.3]).astype(np.float32)
+        out = smooth_l1_loss_Job(prediction, label)
+
+        # out [0.02       0.12499999 1.7        0.005      0.17999998]
+
+    """
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SmoothL1Loss_")
+        )
+        .Op("smooth_l1_loss")
+        .Input("prediction", [prediction])
+        .Input("label", [label])
+        .Output("loss")
+    )
+    op.Attr("beta", float(beta))
+    return op.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def ctc_loss(
+    log_probs: oneflow._oneflow_internal.BlobDesc,
+    targets: oneflow._oneflow_internal.BlobDesc,
+    input_lengths: oneflow._oneflow_internal.BlobDesc,
+    target_lengths: oneflow._oneflow_internal.BlobDesc,
+    blank: int = 0,
+    reduction: str = "mean",
+    zero_infinity: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes the CTC(Connectionist Temporal Classification) loss.
+    This operator implements the CTC loss as presented in (Graves et al., 2006).
+
+
+    Args:
+        log_probs (oneflow._oneflow_internal.BlobDesc): A Blob of shape [input_length, batch_size, num_labels]. The logarithmized probabilities of the outputs (e.g. obtained with flow.nn.logsoftmax()).
+        targets (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size, max_target_length]. It represent the target sequences. Each element in the target sequence is a class index. And the target index cannot be blank (default=0).
+        input_lengths (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size]. It represent the lengths of the inputs. And the lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
+        target_lengths (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size]. It represent lengths of the targets. Lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
+        blank (int, optional): Blank label. Defaults to 0.
+        reduction (str, optional): The reduce type, it can be the one of "none", "mean", "sum". "none": no reduction will be applied, "mean": the output losses will be divided by the target lengths and then the mean over the batch is taken, "sum": the output will be summed. Defaults to "mean".
+        zero_infinity (bool, optional):  Whether to zero infinite losses and the associated gradients. Infinite losses mainly occur when the inputs are too short to be aligned to the targets. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def ctc_loss_job(
+            log_probs: tp.Numpy.Placeholder(shape=(5, 2, 3)),
+            targets: tp.Numpy.Placeholder(shape=(2, 3), dtype=flow.int32),
+            input_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
+            target_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            loss = flow.ctc_loss(
+                log_probs, targets, input_lengths, target_lengths, blank=0, reduction="none"
+            )
+            return loss
+
+
+        log_probs = np.array(
+            [
+                [[-1.1031, -0.7998, -1.5200], [-0.9808, -1.1363, -1.1908]],
+                [[-1.2258, -1.0665, -1.0153], [-1.1135, -1.2331, -0.9671]],
+                [[-1.3348, -0.6611, -1.5118], [-0.9823, -1.2355, -1.0941]],
+                [[-1.3850, -1.3273, -0.7247], [-0.8235, -1.4783, -1.0994]],
+                [[-0.9049, -0.8867, -1.6962], [-1.4938, -1.3630, -0.6547]],
+            ]
+        ).astype(np.float32)
+        targets = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
+        input_lengths = np.array([5, 5]).astype("int32")
+        target_lengths = np.array([3, 3]).astype("int32")
+        loss = ctc_loss_job(log_probs, targets, input_lengths, target_lengths)
+
+        # loss [3.918017 2.907672]
+
+    """
+    name = name if name is not None else id_util.UniqueStr("CTCLoss_")
+    (loss, _) = (
+        flow.user_op_builder(name)
+        .Op("ctc_loss")
+        .Input("log_probs", [log_probs])
+        .Input("targets", [targets])
+        .Input("input_lengths", [input_lengths])
+        .Input("target_lengths", [target_lengths])
+        .Output("loss")
+        .Output("alpha")
+        .Attr("blank", int(blank))
+        .Attr("zero_infinity", zero_infinity)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    if zero_infinity:
+        cond = flow.math.equal(
+            loss,
+            flow.constant(
+                float("inf"),
+                dtype=loss.dtype,
+                shape=loss.shape,
+                name=name + "_constant",
+            ),
+            name=name + "_equal",
+        )
+        loss = flow.where(
+            cond,
+            flow.zeros(dtype=loss.dtype, shape=loss.shape, name=name + "_zeros"),
+            loss,
+            name=name + "_where",
+        )
+    if reduction == "mean":
+        return flow.math.reduce_mean(
+            flow.math.xdivy(
+                loss,
+                flow.cast(
+                    flow.math.clip_by_value(
+                        target_lengths, min_value=1, name=name + "_clip_by_value"
+                    ),
+                    dtype=log_probs.dtype,
+                    name=name + "_cast",
+                ),
+                name=name + "_xdivy",
+            ),
+            name=name + "_reduce_mean",
+        )
+    elif reduction == "sum":
+        return flow.math.reduce_sum(loss, name=name + "_reduce_sum")
+    else:
+        return loss
+
+
+def ctc_greedy_decoder(
+    log_probs: oneflow._oneflow_internal.BlobDesc,
+    input_lengths: oneflow._oneflow_internal.BlobDesc,
+    merge_repeated: bool = True,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """Performs greedy decoding on the logits given in input (best path).
+
+    Args:
+        log_probs (oneflow._oneflow_internal.BlobDesc): A Blob of shape [input_length, batch_size, num_labels]. The logarithmized probabilities of the outputs (e.g. obtained with flow.nn.logsoftmax()).
+        input_lengths (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size]. It represent the lengths of the inputs. And the lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
+        merge_repeated (bool, optional): If merge_repeated is True, merge repeated classes in output. This means that if consecutive logits' maximum indices are the same, only the first of these is emitted. Defaults to True.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        decoded(oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size, input_length], The decoded outputs.
+        neg_sum_logits(oneflow._oneflow_internal.BlobDesc): A float matrix (batch_size x 1) containing, for the sequence found, the negative of the sum of the greatest logit at each timeframe.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+        from typing import Tuple
+
+
+        @flow.global_function()
+        def ctc_greedy_decoder_job(
+            log_probs: tp.Numpy.Placeholder(shape=(4, 2, 5)),
+            input_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int64),
+        ) -> Tuple[tp.Numpy, tp.Numpy]:
+            decoded, neg_sum_logits = flow.nn.ctc_greedy_decoder(
+                log_probs, input_lengths, merge_repeated=True
+            )
+            return decoded, neg_sum_logits
+
+
+        log_probs = np.array(
+            [
+                [[-1.54, -1.20, -1.95, -1.65, -1.81], [-1.84, -1.74, -1.58, -1.55, -1.12]],
+                [[-1.68, -1.48, -1.89, -1.30, -2.07], [-1.13, -1.45, -1.24, -1.61, -1.66]],
+                [[-1.56, -1.40, -2.83, -1.67, -1.48], [-1.20, -2.01, -2.05, -1.95, -1.24]],
+                [[-2.09, -1.76, -1.36, -1.67, -1.45], [-1.85, -1.48, -1.34, -2.16, -1.55]],
+            ]
+        ).astype(np.float32)
+        input_lengths = np.array([4, 4])
+        decoded, neg_sum_logits = ctc_greedy_decoder_job(log_probs, input_lengths)
+
+        # decoded [[1 3 1 2] [0 2 0 0]]
+        # neg_sum_logits [[5.26] [4.79]]
+
+
+    """
+    name = name if name is not None else id_util.UniqueStr("CTCGreedyDecode_")
+    (decoded, neg_sum_logits) = (
+        flow.user_op_builder(name)
+        .Op("ctc_greedy_decoder")
+        .Input("log_probs", [log_probs])
+        .Input("input_lengths", [input_lengths])
+        .Output("decoded")
+        .Output("neg_sum_logits")
+        .Attr("merge_repeated", merge_repeated)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (decoded, neg_sum_logits)
diff --git a/python/oneflow/compatible/single_client/ops/losses/__init__.py b/python/oneflow/compatible/single_client/ops/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/ops/losses/add_loss.py b/python/oneflow/compatible/single_client/ops/losses/add_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c588af7319622fb172b770561f6d4e529da083f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/losses/add_loss.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible.single_client.eager import gradient_util as gradient_util
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+def api_add_loss(loss: oneflow._oneflow_internal.BlobDesc) -> None:
+    """Mark a `Blob` as a loss. Auto grad starts at every loss blob. It doesn't has to be a product of typical "loss" operator like softmax loss but can also be a `Blob` produced by any operator.
+
+    Args:
+        loss: A `Blob`.
+    """
+    return enable_if.unique([lazy_add_loss, eager_add_loss])(loss)
+
+
+@enable_if.condition(
+    hob.in_global_mode & hob.is_trainable & ~hob.eager_execution_enabled
+)
+def lazy_add_loss(loss):
+    c_api_util.CurJobBuildAndInferCtx_AddLossLogicalBlobName(loss.unique_name)
+
+
+@enable_if.condition(
+    hob.in_global_mode & hob.is_trainable & hob.eager_execution_enabled
+)
+def eager_add_loss(loss):
+    c_api_util.CurJobBuildAndInferCtx_AddLossLogicalBlobName(loss.unique_name)
+    gradient_util.GetDefaultBackwardBlobRegister().TrySetObject4BlobName(
+        loss.logical_blob_name, loss.blob_object
+    )
diff --git a/python/oneflow/compatible/single_client/ops/math_binary_elementwise_ops.py b/python/oneflow/compatible/single_client/ops/math_binary_elementwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..f940e6ca6eeafce11eeed170e57a7a5da25d8df8
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/math_binary_elementwise_ops.py
@@ -0,0 +1,288 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def build_math_binary_elementwise_op(math_op, x, y, name=None):
+    if name is None:
+        name = id_util.UniqueStr(math_op + "_")
+    return (
+        flow.user_op_builder(name)
+        .Op(math_op)
+        .Input("x", [x])
+        .Input("y", [y])
+        .Output("z")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def atan2(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the values of :math:`arctan(\\frac{x}{y})`.
+
+    The equation is:
+
+    .. math::
+
+        out = arctan(\\frac{x}{y})
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def atan2Job(x: tp.Numpy.Placeholder((3,),), y: tp.Numpy.Placeholder((3, ))
+        )-> tp.Numpy:
+            return flow.math.atan2(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([4, 4, 4]).astype(np.float32)
+        out = atan2Job(x, y)
+
+
+        # out [0.24497867 0.4636476  0.6435011 ]
+        # We take the first value as an example
+        # (arctan(1/4) * pi) / 180 = 0.24497867
+
+    """
+    return build_math_binary_elementwise_op("atan2", x, y, name)
+
+
+def pow(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: Union[oneflow._oneflow_internal.BlobDesc, float],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the Pow result.
+
+    The equation is:
+
+    .. math::
+
+        out = x^y
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (Union[oneflow._oneflow_internal.BlobDesc, float]): A Blob or float value, the exponential factor of Pow
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def powJob(x: tp.Numpy.Placeholder((3,), ), y: tp.Numpy.Placeholder((3,))
+                ) -> tp.Numpy:
+            return flow.math.pow(x, y)
+
+
+        x = np.array([2, 3, 4]).astype(np.float32)
+        y = np.array([2, 3, 4]).astype(np.float32)
+        out = powJob(x, y)
+
+        # out [  4.  27. 256.]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def scalar_pow_job(x: tp.Numpy.Placeholder(shape=(3, )))->tp.Numpy:
+            with flow.scope.placement("cpu", "0:0"):
+                out = flow.math.pow(x, 2.0)
+            return out
+
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = scalar_pow_job(x)
+
+        # out [1. 4. 9.]
+    """
+    if name is None:
+        name = id_util.UniqueStr("Pow_")
+    if isinstance(y, (int, float)):
+        return (
+            flow.user_op_builder(name)
+            .Op("scalar_pow")
+            .Input("in", [x])
+            .Attr("exponent", float(y))
+            .Output("out")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        return build_math_binary_elementwise_op("pow", x, y, name)
+
+
+def floordiv(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the result of :math:`x/y`, rounding toward the most negative integer value
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def floor_div_Job(x: tp.Numpy.Placeholder((3,)),
+                        y: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.floordiv(x, y)
+
+
+        x = np.array([4, 3, 5]).astype(np.float32)
+        y = np.array([3, 2, 2]).astype(np.float32)
+        out = floor_div_Job(x, y)
+
+        # out [1. 1. 2.]
+    """
+    return build_math_binary_elementwise_op("floordiv", x, y, name)
+
+
+def xdivy(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the result of :math:`x/y`
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def xdivy_Job(x: tp.Numpy.Placeholder((3,)),
+                        y: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.xdivy(x, y)
+
+
+        x = np.array([4, 3, 5]).astype(np.float32)
+        y = np.array([3, 2, 2]).astype(np.float32)
+        out = xdivy_Job(x, y)
+
+        # out [1.3333334 1.5       2.5      ]
+
+    """
+    return build_math_binary_elementwise_op("xdivy", x, y, name)
+
+
+def xlogy(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the result of :math:`x*log(y)`
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def xlogy_Job(x: tp.Numpy.Placeholder((3,)),
+                    y: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.xlogy(x, y)
+
+
+        x = np.array([2, 2, 2]).astype(np.float32)
+        y = np.array([4, 8, 16]).astype(np.float32)
+        out = xlogy_Job(x, y)
+
+        # out [2.7725887 4.158883  5.5451775]
+    """
+    return build_math_binary_elementwise_op("xlogy", x, y, name)
diff --git a/python/oneflow/compatible/single_client/ops/math_ops.py b/python/oneflow/compatible/single_client/ops/math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74fe9d1076acc36e265b0c0cb256c415da14e5e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/math_ops.py
@@ -0,0 +1,2119 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import List, Optional, Sequence, Tuple, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import module as module_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.ops import (
+    math_unary_elementwise_ops as math_unary_elementwise_ops,
+)
+from oneflow.compatible.single_client.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def add(
+    x: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    y: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Compute :math:`X + Y` element-wise, math.add supports broadcasting.
+    The equation is:
+
+    .. math::
+        out = X + Y
+
+    Args:
+        x (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob.
+        y (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob has the same type of x.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob is added by x and y, and has the same type of x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def addJob(x: tp.Numpy.Placeholder((3, )),
+                y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.add(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([1, 1, 1]).astype(np.float32)
+        out = addJob(x, y)
+
+        # out [2., 3., 4.]
+
+    """
+    if isinstance(x, (int, float)):
+        return scalar_add(y, x, name)
+    elif isinstance(y, (int, float)):
+        return scalar_add(x, y, name)
+    elif x.shape == y.shape and x.is_dynamic == y.is_dynamic:
+        return element_wise_add(x, y, name)
+    elif x.shape == (1,):
+        return scalar_add_by_tensor(y, x, name)
+    elif y.shape == (1,):
+        return scalar_add_by_tensor(x, y, name)
+    else:
+        return broadcast_add(x, y, name)
+
+
+def _recursive_build_add_n(inputs, name=None):
+    inputs = list(inputs)
+    kernel_max_inputs = 8
+    if len(inputs) == 1:
+        return inputs[0]
+    elif len(inputs) <= kernel_max_inputs:
+        return (
+            flow.user_op_builder(
+                name if name is not None else id_util.UniqueStr("AddN_")
+            )
+            .Op("add_n")
+            .Input("in", inputs)
+            .Output("out")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        assert len(inputs) > kernel_max_inputs
+        new_inputs = inputs[kernel_max_inputs:]
+        new_inputs.append(_recursive_build_add_n(inputs[:kernel_max_inputs]))
+        return _recursive_build_add_n(new_inputs)
+
+
+def add_n(
+    inputs: Sequence[oneflow._oneflow_internal.BlobDesc], name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Add all the input tensors in element-wise.
+
+    Args:
+        inputs (Sequence[oneflow._oneflow_internal.BlobDesc]): A list of Blob, each Blob has the same shape and type.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The sum of the inputs, has the same shape and type as the elements of inputs.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def add_n_Job(x: tp.Numpy.Placeholder((3, )),
+                    y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.add_n([x, y])
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([1, 1, 1]).astype(np.float32)
+        out = add_n_Job(x, y)
+        print(out)
+
+        # out [2., 3., 4.]
+
+    """
+    return _recursive_build_add_n(inputs, name)
+
+
+def subtract(
+    x: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    y: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Compute :math:`X - Y` element-wise.
+
+    The equation is:
+
+    .. math::
+        out = X - Y
+
+    Args:
+        x (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob.
+        y (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob has the same type of x.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob after subtracting, has the same type as x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def subtractJob(x: tp.Numpy.Placeholder((3, )),
+                        y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.subtract(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([2, 4, 1]).astype(np.float32)
+        out = subtractJob(x, y)
+
+        # out [-1., -2., 2.]
+
+    """
+    if isinstance(x, (int, float)):
+        return scalar_add(-1 * y, x, name)
+    elif isinstance(y, (int, float)):
+        return scalar_add(x, -1 * y, name)
+    elif x.shape == y.shape:
+        return broadcast_sub(x, y, name)
+    elif y.shape == (1,):
+        return scalar_sub_by_tensor(x, y, name)
+    else:
+        return broadcast_sub(x, y, name)
+
+
+def multiply(
+    x: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    y: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Compute :math:`x \\times y` element-wise.
+
+    The equation is:
+
+    .. math::
+        out = X \\times Y
+
+    Args:
+        x (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob.
+        y (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob has the same type of x.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob after multiplying, has the same type as x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def multiplyJob(x: tp.Numpy.Placeholder((3, )),
+                        y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.multiply(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([2, 3, 3]).astype(np.float32)
+        out = multiplyJob(x, y)
+
+        # out [2., 6., 9.]
+
+    """
+    if isinstance(x, (int, float)):
+        return scalar_mul(y, x, name)
+    elif isinstance(y, (int, float)):
+        return scalar_mul(x, y, name)
+    elif x.shape == y.shape:
+        return element_wise_mul(x, y, name)
+    elif x.shape == (1,):
+        return scalar_mul_by_tensor(y, x, name)
+    elif y.shape == (1,):
+        return scalar_mul_by_tensor(x, y, name)
+    else:
+        return broadcast_mul(x, y, name)
+
+
+def divide(
+    x: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    y: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes the division of x by y.
+
+    The equation is:
+
+    .. math::
+        out = \\frac{X}{Y}
+
+    Args:
+        x (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob.
+        y (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with same shape as input x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def divideJob(x: tp.Numpy.Placeholder((3, )),
+                    y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.divide(x, y)
+
+        x = np.array([25, 16, 9]).astype(np.float32)
+        y = np.array([10, 4, 2]).astype(np.float32)
+        out = divideJob(x, y)
+
+        # out [2.5, 4., 4.5]
+
+    """
+    if isinstance(x, (int, float)):
+        return scalar_mul(math_unary_elementwise_ops.reciprocal_no_nan(y), x, name)
+    elif isinstance(y, (int, float)):
+        if y == 0 or y == 0.0:
+            y = 0.0
+        else:
+            y = 1.0 / float(y)
+        return scalar_mul(x, y, name)
+    elif x.shape == y.shape:
+        return broadcast_div(x, y, name)
+    elif y.shape == (1,):
+        return scalar_div_by_tensor(x, y, name)
+    else:
+        return broadcast_div(x, y, name)
+
+
+def floor_mod(
+    x: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    y: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator mods two Blobs.
+
+    The equation is:
+
+    .. math::
+        out = X \\bmod Y
+
+    Args:
+        x (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob
+        y (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob has the same type of x
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        NotImplementedError: x must be an int or a float
+        NotImplementedError: y must be an int or a float
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with same type as input x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def modJob(x: tp.Numpy.Placeholder((3, )),
+                y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.mod(x, y)
+
+        x = np.array([16, 9, 5]).astype(np.float32)
+        y = np.array([6, 4, 3]).astype(np.float32)
+        out = modJob(x, y)
+
+        # out [4., 1., 2.]
+
+    """
+    if isinstance(x, (int, float)):
+        raise NotImplementedError
+    elif isinstance(y, (int, float)):
+        raise NotImplementedError
+    elif x.shape == y.shape:
+        return broadcast_floor_mod(x, y, name)
+    else:
+        return broadcast_floor_mod(x, y, name)
+
+
+def scalar_add(x, operand, name=None):
+    if name is None:
+        name = id_util.UniqueStr("ScalarAdd_")
+    builder = flow.user_op_builder(name).Op("scalar_add").Input("in", [x]).Output("out")
+    if isinstance(operand, int):
+        builder = (
+            builder.Attr("has_int_operand", True)
+            .Attr("has_float_operand", False)
+            .Attr("int_operand", operand)
+            .Attr("float_operand", 0.0)
+        )
+    elif isinstance(operand, float):
+        builder = (
+            builder.Attr("has_int_operand", False)
+            .Attr("has_float_operand", True)
+            .Attr("int_operand", 0)
+            .Attr("float_operand", operand)
+        )
+    return builder.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def scalar_add_by_tensor(x, scalar, name=None):
+    return (
+        flow.user_op_builder(name or id_util.UniqueStr("ScalarAddByTensor_"))
+        .Op("scalar_add_by_tensor")
+        .Input("x", [x])
+        .Input("scalar", [scalar])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def element_wise_add(x, y, name=None):
+    return flow.math.add_n([x, y], name)
+
+
+def build_broadcast_binary_op(math_op, x, y, name=None):
+    if name is None:
+        name = id_util.UniqueStr(math_op + "_")
+    return (
+        flow.user_op_builder(name)
+        .Op(math_op)
+        .Input("x", [x])
+        .Input("y", [y])
+        .Output("z")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def broadcast_add(x, y, name=None):
+    return build_broadcast_binary_op("broadcast_add", x, y, name)
+
+
+def broadcast_sub(x, y, name=None):
+    return build_broadcast_binary_op("broadcast_sub", x, y, name)
+
+
+def scalar_sub_by_tensor(x, scalar, name=None):
+    return (
+        flow.user_op_builder(name or id_util.UniqueStr("ScalarSubByTensor_"))
+        .Op("scalar_sub_by_tensor")
+        .Input("x", [x])
+        .Input("scalar", [scalar])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def element_wise_mul(x, y, name=None):
+    return (
+        flow.user_op_builder(name or id_util.UniqueStr("ElementWiseMul_"))
+        .Op("multiply")
+        .Input("x", [x])
+        .Input("y", [y])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def broadcast_mul(x, y, name=None):
+    return build_broadcast_binary_op("broadcast_mul", x, y, name)
+
+
+def scalar_mul(x, operand, name=None):
+    if name is None:
+        name = id_util.UniqueStr("ScalarMul_")
+    builder = flow.user_op_builder(name).Op("scalar_mul").Input("in", [x]).Output("out")
+    if isinstance(operand, int):
+        builder = (
+            builder.Attr("has_int_operand", True)
+            .Attr("has_float_operand", False)
+            .Attr("int_operand", operand)
+            .Attr("float_operand", 0.0)
+        )
+    elif isinstance(operand, float):
+        builder = (
+            builder.Attr("has_int_operand", False)
+            .Attr("has_float_operand", True)
+            .Attr("int_operand", 0)
+            .Attr("float_operand", operand)
+        )
+    return builder.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def scalar_mul_by_tensor(x, scalar, name=None):
+    return (
+        flow.user_op_builder(name or id_util.UniqueStr("ScalarMulByTensor_"))
+        .Op("scalar_mul_by_tensor")
+        .Input("x", [x])
+        .Input("scalar", [scalar])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def broadcast_div(x, y, name=None):
+    return build_broadcast_binary_op("broadcast_div", x, y, name)
+
+
+def scalar_div_by_tensor(x, scalar, name=None):
+    return (
+        flow.user_op_builder(name or id_util.UniqueStr("ScalarDivByTensor_"))
+        .Op("scalar_div_by_tensor")
+        .Input("x", [x])
+        .Input("scalar", [scalar])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def broadcast_floor_mod(x, y, name=None):
+    return build_broadcast_binary_op("broadcast_floor_mod", x, y, name)
+
+
+def gelu(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Gelu activation operator.
+
+    The equation is:
+
+    .. math::
+        out = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def geluJob(x: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.gelu(x)
+
+        x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        out = geluJob(x)
+
+        # out [-0.15426877, 0., 0.34573123]
+
+    """
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Gelu_"))
+        .Op("gelu")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def relu(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Relu activation
+
+    The equation is:
+
+    .. math::
+        out = max(X, 0)
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: An activated Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def reluJob(x: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.relu(x)
+
+        x = np.array([-1, 0, 5]).astype(np.float32)
+        out = reluJob(x)
+
+        # out [0., 0., 5.]
+
+    """
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Relu_"))
+        .Op("relu")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def sigmoid(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Sigmoid activation
+
+    The equation is:
+
+    .. math::
+        out = \\frac{1}{1 + e^{-x}}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: An activated Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def sigmoidJob(x: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.sigmoid(x)
+
+        x = np.array([-1, 0, 1]).astype(np.float32)
+        out = sigmoidJob(x)
+
+        # out [0.26894143, 0.5, 0.7310586]
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Sigmoid_")
+        )
+        .Op("sigmoid")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def sigmoid_grad(
+    y: oneflow._oneflow_internal.BlobDesc,
+    dy: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SigmoidGrad_")
+        )
+        .Op("sigmoid_grad")
+        .Input("y", [y])
+        .Input("dy", [dy])
+        .Output("dx")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def unsorted_segment_sum(
+    data: oneflow._oneflow_internal.BlobDesc,
+    segment_ids: oneflow._oneflow_internal.BlobDesc,
+    num_segments: int,
+    axis: int = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes the sum along segments of a Blob.
+
+    Args:
+        data (oneflow._oneflow_internal.BlobDesc): Input Blob
+        segment_ids (oneflow._oneflow_internal.BlobDesc): A Blob should be the size of the first dimension, with consecutive IDs in the range 0 to k (k < d0).
+        num_segments (int): num_segments should equal the number of distinct segment IDs.
+        axis (int, optional): The axis of data. Defaults to 0.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with the same type of data.
+
+    For example:
+
+    .. code-block:: python
+
+        # Example 1:
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def unsorted_segment_sumJob(data: tp.Numpy.Placeholder((3, 4)),
+                                    segment_ids: tp.Numpy.Placeholder((4, ), dtype=flow.int32)
+        )->tp.Numpy:
+            return flow.math.unsorted_segment_sum(data, segment_ids, num_segments=2, axis=1)
+
+        input_blob = np.array([[1, 2, 3, 4],
+                               [5, 6, 7 ,8],
+                               [9, 10, 11, 12]]).astype(np.float32)
+        segment_ids = np.array([0, 1, 0, 1]).astype(np.int32)
+        out = unsorted_segment_sumJob(input_blob, segment_ids)
+
+        # out [[ 4.  6.]
+        #      [12. 14.]
+        #      [20. 22.]]
+
+        # Example 2
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def unsorted_segment_sumJob(data: tp.Numpy.Placeholder((3, 4)),
+                                    segment_ids: tp.Numpy.Placeholder((3, ), dtype=flow.int32)
+        )->tp.Numpy:
+            return flow.math.unsorted_segment_sum(data, segment_ids, num_segments=2, axis=0)
+
+        input_blob = np.array([[1, 2, 3, 4],
+                               [5, 6, 7 ,8],
+                               [9, 10, 11, 12]]).astype(np.float32)
+        segment_ids = np.array([0, 1, 0]).astype(np.int32)
+        out = unsorted_segment_sumJob(input_blob, segment_ids)
+
+        #  out [[10. 12. 14. 16.]
+        #       [ 5.  6.  7.  8.]]
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("UnsortedSegmentSum_")
+        )
+        .Op("unsorted_segment_sum")
+        .Input("data", [data])
+        .Input("segment_ids", [segment_ids])
+        .Output("out")
+        .Attr("axis", int(axis))
+        .Attr("num_segments", int(num_segments))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def unsorted_segment_sum_like(
+    data: oneflow._oneflow_internal.BlobDesc,
+    segment_ids: oneflow._oneflow_internal.BlobDesc,
+    like: oneflow._oneflow_internal.BlobDesc,
+    axis: int = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes the sum along segments of a Blob, the output shape is the same as the `like` Blob.
+
+    Args:
+        data (oneflow._oneflow_internal.BlobDesc): Input Blob
+        segment_ids (oneflow._oneflow_internal.BlobDesc): A Blob should be the size of the first dimension, with consecutive IDs in the range 0 to k (k < d0).
+        like (oneflow._oneflow_internal.BlobDesc): The input Blob which specifies shape
+        axis (int, optional): The axis of data. Defaults to 0.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def unsorted_segment_sum_like_Job(data: tp.Numpy.Placeholder((3, 4)),
+                                        segment_ids: tp.Numpy.Placeholder((3, ), dtype=flow.int32),
+                                        like: tp.Numpy.Placeholder((2, 4), dtype=flow.float32)
+        )->tp.Numpy:
+            return flow.math.unsorted_segment_sum_like(data, segment_ids, like, axis=0)
+
+        input_blob = np.array([[1, 2, 3, 4],
+                            [5, 6, 7 ,8],
+                            [9, 10, 11, 12]]).astype(np.float32)
+        segment_ids = np.array([0, 1, 0]).astype(np.int32)
+        like = np.zeros(shape=(2, 4), dtype=np.float32)
+
+        out = unsorted_segment_sum_like_Job(input_blob, segment_ids, like)
+
+        # out [[10. 12. 14. 16.]
+        #      [ 5.  6.  7.  8.]]
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("UnsortedSegmentSumLike_")
+        )
+        .Op("unsorted_segment_sum_like")
+        .Input("data", [data])
+        .Input("segment_ids", [segment_ids])
+        .Input("like", [like])
+        .Output("out")
+        .Attr("axis", int(axis))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def unsorted_batch_segment_sum(
+    data: oneflow._oneflow_internal.BlobDesc,
+    segment_ids: oneflow._oneflow_internal.BlobDesc,
+    num_segments: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """It is similar with `unsorted_segment_sum`, the difference is that `unsorted_batch_segment_sum` brings a `batch axis`. We can do the segment sum in different batch of data.
+
+    For example, the segment id is like:
+
+    .. code-block:: python
+
+        [[0 0 0 1 2 2 3 3],
+         [0 0 1 1 2 3 3 3]]
+
+    Args:
+        data (oneflow._oneflow_internal.BlobDesc): Input Blob
+        segment_ids (oneflow._oneflow_internal.BlobDesc): A Blob with shape (d0, d1). The d0, d1 are the first and second dimension of data.
+        num_segments (int): num_segments should equal the number of distinct segment IDs.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def unsorted_batch_segment_sum_Job(data: tp.Numpy.Placeholder((3, 4)),
+                                        segment_ids: tp.Numpy.Placeholder((3, 4), dtype=flow.int32)
+        )->tp.Numpy:
+            return flow.math.unsorted_batch_segment_sum(data, segment_ids, 2)
+
+        input_blob = np.array([[1, 2, 3, 4],
+                            [1, 2, 3 ,4],
+                            [1, 2, 3, 4]]).astype(np.float32)
+        segment_ids = np.array([[0, 0, 0, 1],
+                                [0, 0, 1, 0],
+                                [0, 1, 0, 0]]).astype(np.int32)
+        out = unsorted_batch_segment_sum_Job(input_blob, segment_ids)
+
+        # out [[6. 4.]
+        #      [7. 3.]
+        #      [8. 2.]]
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("UnsortedBatchSegmentSum_")
+        )
+        .Op("unsorted_batch_segment_sum")
+        .Input("data", [data])
+        .Input("segment_ids", [segment_ids])
+        .Output("out")
+        .Attr("num_segments", int(num_segments))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def cast(
+    x: oneflow._oneflow_internal.BlobDesc, dtype: flow.dtype, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The op takes input x and casts it to the output with `dtype`
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input Blob
+        dtype (flow.dtype): Data type of the output
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def cast_Job(x: tp.Numpy.Placeholder((3, ), dtype=flow.float32)
+        )->tp.Numpy:
+            return flow.cast(x, dtype=flow.int32)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = cast_Job(x)
+
+        # out.dtype = "int32"
+
+    """
+    if x.dtype == dtype:
+        return x
+    if name is None:
+        name = id_util.UniqueStr("Cast_")
+    return (
+        flow.user_op_builder(name)
+        .Op("cast")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("dtype", dtype)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def equal(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the truth value of :math:`{x}=={y}` element-wise.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def equal_Job(x: tp.Numpy.Placeholder((3, )),
+                    y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.equal(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([1, 2, 1]).astype(np.float32)
+        out = equal_Job(x, y)
+
+        # out [1 1 0]
+
+    """
+    return build_broadcast_binary_op("broadcast_equal", x, y, name)
+
+
+def not_equal(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the truth value of :math:`{x}!={y}` element-wise.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def not_equal_Job(x: tp.Numpy.Placeholder((3, )),
+                        y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.not_equal(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([1, 2, 1]).astype(np.float32)
+        out = not_equal_Job(x, y)
+
+        # out [0 0 1]
+
+    """
+    return build_broadcast_binary_op("broadcast_not_equal", x, y, name)
+
+
+def less(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the truth value of :math:`x < y` element-wise.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def less_Job(x: tp.Numpy.Placeholder((3, )),
+                    y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.less(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([1, 2, 4]).astype(np.float32)
+        out = less_Job(x, y)
+
+        # out [0 0 1]
+
+    """
+    return build_broadcast_binary_op("broadcast_less", x, y, name)
+
+
+def less_equal(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the truth value of :math:`x <= y` element-wise.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def less_equal_Job(x: tp.Numpy.Placeholder((3, )),
+                        y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.less_equal(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([1, 1, 4]).astype(np.float32)
+        out = less_equal_Job(x, y)
+
+        # out [1 0 1]
+
+    """
+    return build_broadcast_binary_op("broadcast_less_equal", x, y, name)
+
+
+def greater(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the truth value of :math:`x > y` element-wise.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def greater_Job(x: tp.Numpy.Placeholder((3, )),
+                        y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.greater(x, y)
+
+        x = np.array([1, 1, 4]).astype(np.float32)
+        y = np.array([1, 2, 3]).astype(np.float32)
+        out = greater_Job(x, y)
+
+        # out [0 0 1]
+
+    """
+    return build_broadcast_binary_op("broadcast_greater", x, y, name)
+
+
+def greater_equal(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the truth value of :math:`x >= y` element-wise.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def greater_equal_Job(x: tp.Numpy.Placeholder((3, )),
+                            y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.greater_equal(x, y)
+
+        x = np.array([1, 1, 4]).astype(np.float32)
+        y = np.array([1, 2, 3]).astype(np.float32)
+        out = greater_equal_Job(x, y)
+
+        # out [1 0 1]
+
+    """
+    return build_broadcast_binary_op("broadcast_greater_equal", x, y, name)
+
+
+def logical_and(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Logical AND function.
+
+    Each element is calculated by:
+
+    .. math::
+        out = X \\land Y
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def logical_and_Job(x: tp.Numpy.Placeholder((3, )),
+                            y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.logical_and(x, y)
+
+        x = np.array([1, 0, 1]).astype(np.float32)
+        y = np.array([0, 0, 1]).astype(np.float32)
+        out = logical_and_Job(x, y)
+
+        # out [0 0 1]
+
+    """
+    return build_broadcast_binary_op("broadcast_logical_and", x, y, name)
+
+
+def minimum(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the min of x and y element-wise, this op supports broadcasting.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob. Must have the same type of x
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob, has the same type of x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def minimum_Job(x: tp.Numpy.Placeholder((3, )),
+                        y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.minimum(x, y)
+
+        x = np.array([2, 3, 4]).astype(np.float32)
+        y = np.array([4, 2, 1]).astype(np.float32)
+        out = minimum_Job(x, y)
+
+        # out [2. 2. 1.]
+
+    """
+    if x.shape == y.shape:
+        return (
+            flow.user_op_builder(name or id_util.UniqueStr("ElementWiseMinimum_"))
+            .Op("elementwise_minimum")
+            .Input("x", [x])
+            .Input("y", [y])
+            .Output("z")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        return build_broadcast_binary_op("broadcast_minimum", x, y, name)
+
+
+def maximum(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns the max of x and y element-wise, this op supports broadcasting.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob. Must have the same type of x
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob, has the same type of x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def maximum_Job(x: tp.Numpy.Placeholder((3, )),
+                        y: tp.Numpy.Placeholder((3, ))
+        )->tp.Numpy:
+            return flow.math.maximum(x, y)
+
+        x = np.array([2, 3, 4]).astype(np.float32)
+        y = np.array([4, 2, 1]).astype(np.float32)
+        out = maximum_Job(x, y)
+
+        # out [4. 3. 4.]
+
+    """
+    if x.shape == y.shape:
+        return (
+            flow.user_op_builder(name or id_util.UniqueStr("ElementWiseMaximum_"))
+            .Op("elementwise_maximum")
+            .Input("x", [x])
+            .Input("y", [y])
+            .Output("z")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        return build_broadcast_binary_op("broadcast_maximum", x, y, name)
+
+
+def elem_cnt(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Sequence[int]] = None,
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes the product of input_blob's dimensions along the parameter `axis`. By default, all the dimensions will be computed.
+
+    Args:
+        input_blob (oneflow._oneflow_internal.BlobDesc): Input Blob
+        axis (Optional[Sequence[int]], optional): The dimensions along which the op is performed. Defaults to None.
+        dtype (Optional[flow.dtype], optional): The data type. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob
+
+    For example:
+
+    .. code-block:: python
+
+        # Example 1:
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def elem_cnt_Job(x: tp.Numpy.Placeholder((3, 4, 5))
+        )->tp.Numpy:
+            return flow.math.reduced_shape_elem_cnt(x, axis=[0, 1])
+
+        x = np.ones(shape=(3, 4, 5), dtype=np.float32)
+        out = elem_cnt_Job(x) # 3 x 4 = 12
+
+        # out [12]
+
+        # Example 2:
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def elem_cnt_Job(x: tp.Numpy.Placeholder((3, 4, 5))
+        )->tp.Numpy:
+            return flow.math.reduced_shape_elem_cnt(x)
+
+        x = np.ones(shape=(3, 4, 5), dtype=np.float32)
+        out = elem_cnt_Job(x) # 3 x 4 x 5 = 60
+
+        # out [60]
+
+    """
+    op_conf = op_conf_util.OperatorConf()
+    setattr(
+        op_conf,
+        "name",
+        name if name is not None else id_util.UniqueStr("ShapeElemCnt_"),
+    )
+    op_conf.shape_elem_cnt_conf.x = input_blob.unique_name
+    if axis is None:
+        op_conf.shape_elem_cnt_conf.exclude_axis_conf.SetInParent()
+    else:
+        assert isinstance(axis, (tuple, list))
+        op_conf.shape_elem_cnt_conf.include_axis_conf.axis.extend(axis)
+    if dtype is not None:
+        op_conf.shape_elem_cnt_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            dtype
+        )
+    op_conf.shape_elem_cnt_conf.y = "y"
+    interpret_util.Forward(op_conf)
+    out_lbi = logical_blob_id_util.LogicalBlobId()
+    out_lbi.op_name = op_conf.name
+    out_lbi.blob_name = "y"
+    return remote_blob_util.RemoteBlob(out_lbi)
+
+
+def _top_k_at_last_dim(
+    input: oneflow._oneflow_internal.BlobDesc,
+    k: int = 1,
+    sorted: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("TopK_"))
+        .Op("top_k")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("k", k)
+        .Attr("sorted", sorted)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def top_k(
+    input: oneflow._oneflow_internal.BlobDesc,
+    axis: int = -1,
+    k: int = 1,
+    sorted: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Finds the indices of the k largest entries at specified axis, the difference between other framework is that oneflow only return the indices.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob
+        axis (int, optional): dimension to be calculated. Defaults to the last dim (-1)
+        k (int, optional): Number of top elements to look for along the last dimension. Defaults to 1.
+        sorted (bool, optional): If true the resulting k elements will be sorted by the values in descending order. Defaults to True.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob(dtype=int32) contains the indices of the k largest elements.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def topk_Job(x: tp.Numpy.Placeholder((5, ))
+        )->tp.Numpy:
+            return flow.math.top_k(x, 2)
+
+        x = np.array([1, 3, 8, 7, 2], dtype=np.float32)
+        out = topk_Job(x)
+
+        # out [2 3]
+
+    """
+    name = name if name is not None else id_util.UniqueStr("TopK_")
+    num_axes = len(input.shape)
+    axis = axis if axis >= 0 else axis + num_axes
+    assert 0 <= axis < num_axes, "axis out of range"
+    if axis == num_axes - 1:
+        return _top_k_at_last_dim(input, k, sorted, name)
+    else:
+        perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+        x = flow.transpose(input, perm, False, True, name + "_transpose")
+        x = _top_k_at_last_dim(x, k, sorted, name)
+        return flow.transpose(
+            x, get_inversed_perm(perm), False, True, name + "_inverse_transpose"
+        )
+
+
+def _argmax_at_last_dim(
+    input: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("ArgMax_"))
+        .Op("argmax")
+        .Input("in", [input])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def argmax(
+    input: oneflow._oneflow_internal.BlobDesc,
+    axis: int = -1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The op computes the index with the largest value of a Blob at specified axis.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): Input Blob
+        axis (int, optional): dimension to be calculated. Defaults to the last dim (-1)
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob(dtype=int32) contains the index with the largest value of `input`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def argmax_Job(x: tp.Numpy.Placeholder((2, 5))
+        )->tp.Numpy:
+            return flow.math.argmax(x)
+
+        x = np.array([[1, 3, 8, 7, 2],
+                    [1, 9, 4, 3, 2]], dtype=np.float32)
+
+        out = argmax_Job(x)
+
+        # out [2 1]
+
+    """
+    name = name if name is not None else id_util.UniqueStr("ArgMax_")
+    num_axes = len(input.shape)
+    axis = axis if axis >= 0 else axis + num_axes
+    assert 0 <= axis < num_axes, "axis out of range"
+    if axis == num_axes - 1:
+        return _argmax_at_last_dim(input, name)
+    else:
+        perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+        x = flow.transpose(input, perm, False, True, name + "_transpose")
+        x = _argmax_at_last_dim(x, name)
+        x = flow.expand_dims(x, -1, name + "_expand_dims")
+        x = flow.transpose(
+            x, get_inversed_perm(perm), False, True, name + "_inverse_transpose"
+        )
+        x = flow.squeeze(x, [axis], name + "_squeeze")
+        return x
+
+
+def broadcast_to_compatible_with(
+    x: oneflow._oneflow_internal.BlobDesc,
+    compatible: Sequence[oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Returns a 'Blob' with the shape can be broadcasted by other shapes
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): a 'Blob'
+        compatible (Sequence[oneflow._oneflow_internal.BlobDesc]): Sequence of different shape
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A 'Blob' with the biggest shape
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def broadcast_to_compatible_with_Job(x: tp.Numpy.Placeholder((4, 1, 1))
+        )->tp.Numpy:
+            blob_a = flow.constant(value=1, dtype=flow.float32, shape=(1, 2, 1))
+            blob_b = flow.constant(value=1, dtype=flow.float32, shape=(1, 1, 3))
+
+            return flow.math.broadcast_to_compatible_with(x, [blob_a, blob_b])
+
+        x = np.ones(shape=(4, 1, 1), dtype=np.float32)
+
+        out = broadcast_to_compatible_with_Job(x)
+
+        # out.shape (4, 2, 3)
+
+    """
+    assert isinstance(compatible, (list, tuple))
+    if name is None:
+        name = id_util.UniqueStr("BroadcastToCompatibleWith_")
+    op_conf = op_conf_util.OperatorConf()
+    setattr(op_conf, "name", name)
+    setattr(op_conf.broadcast_to_compatible_with_conf, "x", x.unique_name)
+    setattr(op_conf.broadcast_to_compatible_with_conf, "y", "y")
+    op_conf.broadcast_to_compatible_with_conf.compatible.extend(
+        [cp.unique_name for cp in compatible]
+    )
+    interpret_util.Forward(op_conf)
+    ret_lbi = logical_blob_id_util.LogicalBlobId()
+    ret_lbi.op_name = op_conf.name
+    ret_lbi.blob_name = "y"
+    return remote_blob_util.RemoteBlob(ret_lbi)
+
+
+def clip_by_value(
+    values: oneflow._oneflow_internal.BlobDesc,
+    min_value: Optional[Union[int, float]] = None,
+    max_value: Optional[Union[int, float]] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This op clips Blob values to a specified min value and max value.
+
+    The equation is:
+
+    .. math::
+        out = MIN(MAX(x, min), max)
+
+    Args:
+        values (oneflow._oneflow_internal.BlobDesc): Input Blob
+        min_value (Optional[Union[int, float]], optional): The minimum value to clip by. Defaults to None.
+        max_value (Optional[Union[int, float]], optional): The maximum value to clip by. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        ValueError: min_value and max_value `cannot be None at the same time`
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A clipped Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def clip_by_value_Job(x: tp.Numpy.Placeholder((4, ))
+        )->tp.Numpy:
+            return flow.math.clip_by_value(x, min_value=-1, max_value=5)
+
+        x = np.array([-2, 1, 4, 7], dtype=np.float32)
+
+        out = clip_by_value_Job(x)
+
+        # out [-1. 1. 4. 5.]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ClipByValue_")
+    is_floating = values.dtype in [flow.float32, flow.float16, flow.float64]
+    if min_value is not None:
+        floating_min_value = float(min_value) if is_floating else 0.0
+        integral_min_value = 0 if is_floating else int(min_value)
+    if max_value is not None:
+        floating_max_value = float(max_value) if is_floating else 0.0
+        integral_max_value = 0 if is_floating else int(max_value)
+    if min_value is not None and max_value is not None:
+        op_builder = (
+            flow.user_op_builder(name)
+            .Op("clip_by_scalar")
+            .Attr("floating_min", floating_min_value)
+            .Attr("integral_min", integral_min_value)
+            .Attr("floating_max", floating_max_value)
+            .Attr("integral_max", integral_max_value)
+        )
+    elif min_value is not None:
+        op_builder = (
+            flow.user_op_builder(name)
+            .Op("clip_by_scalar_min")
+            .Attr("floating_min", floating_min_value)
+            .Attr("integral_min", integral_min_value)
+        )
+    elif max_value is not None:
+        op_builder = (
+            flow.user_op_builder(name)
+            .Op("clip_by_scalar_max")
+            .Attr("floating_max", floating_max_value)
+            .Attr("integral_max", integral_max_value)
+        )
+    else:
+        raise ValueError("min_value and max_value cannot be None at the same time")
+    return (
+        op_builder.Input("x", [values])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def l2_normalize(
+    input: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[int] = None,
+    epsilon: float = 1e-12,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Use L2 norm to normalizes along dimension `axis`
+
+    The equation is:
+
+    .. math::
+        out = \\frac{x}{\\sqrt{\\Sigma{x^2}+\\epsilon}}
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): Input Blob
+        axis (Optional[int], optional): The axis on which to apply L2 normalization. Defaults to None.
+        epsilon (float, optional): The epsilon value is used to avoid division by zero. Defaults to 1e-12.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The normalized Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def l2_normalize_Job(x: tp.Numpy.Placeholder((4, ))
+        )->tp.Numpy:
+            return flow.math.l2_normalize(x, axis=0)
+
+        x = np.array([1, 2, 3, 4], dtype=np.float32)
+
+        out = l2_normalize_Job(x)
+
+        # out [0.18257418 0.36514837 0.5477226  0.73029673]
+
+    """
+    if axis < 0:
+        axis += len(input.shape)
+    assert axis >= 0 and axis < len(input.shape)
+    (y, square_x_sum) = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("L2Normalize_")
+        )
+        .Op("l2_normalize")
+        .Input("x", [input])
+        .Output("y")
+        .Output("square_x_sum")
+        .Attr("axis", int(axis))
+        .Attr("epsilon", float(epsilon))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return y
+
+
+def squared_difference(
+    x: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    y: Union[int, float, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This op computes :math:`(x - y)^2` element-wise.
+
+    Args:
+        x (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob
+        y (Union[int, float, oneflow._oneflow_internal.BlobDesc]): A Blob with the same type of x
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def squared_difference_Job(x: tp.Numpy.Placeholder((4, )),
+                                y: tp.Numpy.Placeholder((4, ))
+        )->tp.Numpy:
+            return flow.math.squared_difference(x, y)
+
+        x = np.array([1, 2, 3, 4], dtype=np.float32)
+        y = np.array([2, 4, 6, 8], dtype=np.float32)
+
+        out = squared_difference_Job(x, y)
+
+        # out [ 1.  4.  9. 16.]
+
+    """
+    (name_subtract, name_square) = (None, None)
+    if name is not None:
+        name_subtract = name + "_subtract"
+        name_square = name + "_square"
+    return flow.math.square(flow.math.subtract(x, y, name_subtract), name_square)
+
+
+def gelu_grad(
+    x: oneflow._oneflow_internal.BlobDesc,
+    dy: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("GeluGrad_")
+        )
+        .Op("gelu_grad")
+        .Input("x", [x])
+        .Input("dy", [dy])
+        .Output("dx")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def tril(
+    x: oneflow._oneflow_internal.BlobDesc,
+    diagonal: int = 0,
+    fill_value: Union[int, float] = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Compute lower triangle of an matrix.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input Blob.
+        diagonal (int): Diagonal offset, when diagonal > 0, diagonal offset up,
+                        otherwise, offset downward.
+        fill_value(Union[int, float]): The value filled into the upper triangle.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Attention:
+        The dimension of x must greater or equal to 2.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The lower triangle blob of input.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+        @flow.global_function()
+        def tril_Job(x: tp.Numpy.Placeholder((4, 4))
+        )->tp.Numpy:
+            return flow.math.tril(x, 0)
+        x = np.array([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
+                      dtype=np.float32)
+        out = tril_Job(x).get()
+
+        # output [[1, 0, 0, 0],
+                  [1, 2, 0, 0],
+                  [1, 2, 3, 0],
+                  [1, 2, 3, 4]]
+
+    """
+    if isinstance(fill_value, float):
+        is_floating_fill_value = True
+        floating_fill_value = float(fill_value)
+        integer_fill_value = int(0)
+    else:
+        is_floating_fill_value = False
+        floating_fill_value = float(0)
+        integer_fill_value = int(fill_value)
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Tril_"))
+        .Op("tril")
+        .Input("in", [x])
+        .Attr("diagonal", diagonal)
+        .Attr("is_floating_fill_value", is_floating_fill_value)
+        .Attr("floating_fill_value", floating_fill_value)
+        .Attr("integer_fill_value", integer_fill_value)
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def fused_scale_tril(
+    x: oneflow._oneflow_internal.BlobDesc,
+    diagonal: int = 0,
+    fill_value: Union[int, float] = 0,
+    scale: Union[int, float] = 1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if isinstance(fill_value, float):
+        is_floating_fill_value = True
+        floating_fill_value = float(fill_value)
+        integer_fill_value = int(0)
+    else:
+        is_floating_fill_value = False
+        floating_fill_value = float(0)
+        integer_fill_value = int(fill_value)
+    if isinstance(scale, float):
+        is_floating_scale_value = True
+        floating_scale_value = float(scale)
+        integer_scale_value = int(1)
+    else:
+        is_floating_scale_value = False
+        floating_scale_value = float(1)
+        integer_scale_value = int(scale)
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("FusedScaleTril_")
+        )
+        .Op("fused_scale_tril")
+        .Input("in", [x])
+        .Attr("diagonal", diagonal)
+        .Attr("is_floating_fill_value", is_floating_fill_value)
+        .Attr("floating_fill_value", floating_fill_value)
+        .Attr("integer_fill_value", integer_fill_value)
+        .Attr("is_floating_scale_value", is_floating_scale_value)
+        .Attr("floating_scale_value", floating_scale_value)
+        .Attr("integer_scale_value", integer_scale_value)
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def fused_scale_tril_softmax_dropout(
+    x: oneflow._oneflow_internal.BlobDesc,
+    diagonal: int = 0,
+    fill_value: Union[int, float] = 0,
+    scale: Union[int, float] = 1,
+    rate: float = 0.0,
+    noise_shape: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    seed: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = id_util.UniqueStr("FusedTrilScaleSoftmaxMaskScale_")
+    mask = flow.nn.random_mask_like(
+        x, rate, seed, noise_shape, "%s-dropout_random_mask_like" % name
+    )
+    (y, softmax_y) = (
+        flow.user_op_builder(name)
+        .Op("fused_tril_scale_softmax_mask_scale")
+        .Input("x", [x])
+        .Input("mask", [mask])
+        .Attr("diagonal", diagonal)
+        .Attr("tril_fill_value", float(fill_value))
+        .Attr("tril_scale_value", float(scale))
+        .Attr("mask_scale_value", float(1.0 / (1.0 - rate)))
+        .Output("y")
+        .Output("softmax_y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return y
+
+
+def polyval(
+    coeffs: Union[List, Tuple],
+    x: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes the elementwise value of a polynomial.
+
+    Args:
+        coeffs (Union[List, Tuple]): The coefficients of the polynomial.
+        x (oneflow._oneflow_internal.BlobDesc): A Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob, has the same data type of x.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def polyval_Job(
+            x: tp.Numpy.Placeholder((3,), dtype=flow.float32)
+        ) -> tp.Numpy:
+            coeffs = [1.0, 3.0, -2.0]
+            return flow.math.polyval(coeffs, x)
+
+        x = np.array([1.0, 2.0, 3.0]).astype(np.float32)
+        out = polyval_Job(x)
+
+        # output [ 2. 8. 16.]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Polyval_")
+    if not isinstance(coeffs, (list, tuple)):
+        raise ValueError(
+            "Argument coeffs must be list type found {}".format(type(coeffs))
+        )
+    if len(coeffs) < 1:
+        return flow.zeros_like(x, name=name)
+    p = flow.zeros_like(x, name=name)
+    for c in coeffs:
+        p = flow.math.add(c, flow.math.multiply(p, x))
+    return p
+
+
+def in_top_k(
+    targets: oneflow._oneflow_internal.BlobDesc,
+    predictions: oneflow._oneflow_internal.BlobDesc,
+    k: Optional[int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Says whether the targets are in the top K predictions.
+
+    Args:
+        targets (oneflow._oneflow_internal.BlobDesc): A Blob of type int32 or int64.
+        predictions (oneflow._oneflow_internal.BlobDesc): A Blob of type float32.
+        k (Optional[int], optional): Number of top elements to look at for computing precision.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A Blob of type bool. Computed Precision at k as a bool Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def intopk_Job(
+            targets: tp.Numpy.Placeholder((2,), dtype=flow.int32),
+            predictions: tp.Numpy.Placeholder((2, 4), dtype=flow.float32),
+        ) -> tp.Numpy:
+            return flow.math.in_top_k(targets, predictions, 1)
+
+        targets = np.array([3, 1], dtype=np.int32)
+        predictions = np.array([[0.0, 1.0, 2.0, 3.0], [3.0, 2.0, 1.0, 0.0],], dtype=np.float32)
+        out = intopk_Job(targets, predictions)
+
+        # out [1 0]
+
+    """
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("InTopK_"))
+        .Op("in_top_k")
+        .Input("targets", [targets])
+        .Input("predictions", [predictions])
+        .Attr("k", k)
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def range(
+    start, limit=None, delta=1, dtype=None, name="range"
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator is similar to python `range`, the difference is that `oneflow.compatible.single_client.range` generates
+    a Blob.
+
+    Args:
+        start ([type]): The start of interval. Its type should be `int`.
+        limit ([type], optional): The limit of interval. Its type should be `int`.
+        delta (int, optional): The numerical spacing between elements. Defaults to 1.
+        dtype ([type], optional): The output's data type. Currently we only support `oneflow.compatible.single_client.int64`. Defaults to None.
+        name (str, optional): The name for the operation. Defaults to "range".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def range_job()->tp.Numpy:
+            with flow.scope.placement("cpu", "0:0"):
+                out = flow.range(10, dtype=flow.int64)
+
+            return out
+
+        out = range_job()
+
+        # out [0 1 2 3 4 5 6 7 8 9]
+
+    Example2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def range_job()->tp.Numpy:
+            with flow.scope.placement("cpu", "0:0"):
+                out = flow.range(1, 10, 3, dtype=flow.int64)
+
+            return out
+
+        out = range_job()
+
+        # out [1 4 7]
+
+    """
+    if dtype is None:
+        dtype = flow.int64
+    if limit is None:
+        (start, limit) = (0, start)
+    assert limit > start, "Limit should be larger than start"
+    assert delta <= limit - start, "Delta is ilegal"
+    assert type(start) == int, "Params `start`'s type should be int"
+    assert type(limit) == int, "Params `limit`'s type should be int"
+    assert type(delta) == int, "Params `delta`'s type should be int"
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Range_"))
+        .Op("range")
+        .Attr("start", start)
+        .Attr("delta", delta)
+        .Attr("limit", limit)
+        .Attr("dtype", dtype)
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/compatible/single_client/ops/math_unary_elementwise_ops.py b/python/oneflow/compatible/single_client/ops/math_unary_elementwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc02db57e071eb48610fb9b3161f84fee5dc7562
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/math_unary_elementwise_ops.py
@@ -0,0 +1,1378 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import traceback
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.ops import user_op_builder as user_op_builder
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def build_unary_elemwise_math_op(math_op, x, name=None):
+    if name is None:
+        name = id_util.UniqueStr(math_op + "_")
+    return (
+        flow.user_op_builder(name)
+        .Op(math_op)
+        .Input("x", [x])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def abs(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns the absolute value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def abs_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.abs(x)
+
+
+        x = np.array([-1, 2, -3]).astype(np.float32)
+        out = abs_Job(x)
+
+        # out [1. 2. 3.]
+
+    """
+    return build_unary_elemwise_math_op("abs", x, name)
+
+
+def acos(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the acos value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def acos_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.acos(x)
+
+
+        x = np.array([0.5, 0.6, 0.7]).astype(np.float32)
+        out = acos_Job(x)
+
+        # out [1.0471976 0.9272952 0.7953989]
+        # We take the first value as an example
+        # (arccos(0.5) * pi) / 180 = 1.0471976
+
+    """
+    return build_unary_elemwise_math_op("acos", x, name)
+
+
+def acosh(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the inverse hyperbolic cosine value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = log(x+(x^2-1)^\\frac{1}{2})
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob, the range is [1, inf]
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def acosh_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.acosh(x)
+
+
+        x = np.array([2, 3, 4]).astype(np.float32)
+        out = acosh_Job(x)
+
+        # out [1.316958  1.7627473 2.063437 ]
+
+    """
+    return build_unary_elemwise_math_op("acosh", x, name)
+
+
+def asin(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the arcsin value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def asin_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.asin(x)
+
+
+        x = np.array([0.5, 0.6, 0.7]).astype(np.float32)
+        out = asin_Job(x)
+
+        # out [0.5235988  0.64350116 0.7753975 ]
+        # We take the first value as an example
+        # (arcsin(0.5) * pi) / 180 = 0.5235988
+
+    """
+    return build_unary_elemwise_math_op("asin", x, name)
+
+
+def asinh(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the inverse hyperbolic sine value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = log(x+(x^2+1)^\\frac{1}{2})
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def asinh_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.asinh(x)
+
+
+        x = np.array([2, 3, 4]).astype(np.float32)
+        out = asinh_Job(x)
+
+        # out [1.4436355 1.8184464 2.0947125]
+
+    """
+    return build_unary_elemwise_math_op("asinh", x, name)
+
+
+def atan(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the arctan value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def atan_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.atan(x)
+
+
+        x = np.array([0.5, 0.6, 0.7]).astype(np.float32)
+        out = atan_Job(x)
+
+        # out [0.4636476  0.5404195  0.61072594]
+        # We take the first value as an example
+        # (arctan(0.5) * pi) / 180 = 0.4636476
+
+    """
+    return build_unary_elemwise_math_op("atan", x, name)
+
+
+def atanh(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the inverse hyperbolic tangent value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{1}{2}*log(\\frac{1+x}{1-x})
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def atanh_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.atanh(x)
+
+
+        x = np.array([0.5, 0.6, 0.7]).astype(np.float32)
+        out = atanh_Job(x)
+
+        # out [0.54930615 0.6931472  0.8673005 ]
+
+    """
+    return build_unary_elemwise_math_op("atanh", x, name)
+
+
+def ceil(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the ceiling value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def ceil_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.ceil(x)
+
+
+        x = np.array([1.3, 1.5, 2.7]).astype(np.float32)
+        out = ceil_Job(x)
+
+        # out [2. 2. 3.]
+
+    """
+    return build_unary_elemwise_math_op("ceil", x, name)
+
+
+def cos(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the cosine value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def cos_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.cos(x)
+
+
+        x = np.array([1/3*np.pi, 0.25*np.pi, 1.25*np.pi]).astype(np.float32)
+        out = cos_Job(x)
+
+        # out [ 0.49999997  0.70710677 -0.7071068 ]
+
+    """
+    return build_unary_elemwise_math_op("cos", x, name)
+
+
+def cosh(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes hyperbolic cosine value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{e^x+e^{-x}}{2}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def cosh_Job(x: tp.Numpy.Placeholder((3,))
+                    ) -> tp.Numpy:
+            return flow.math.cosh(x)
+
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = cosh_Job(x)
+
+        # out [ 1.5430806  3.7621958 10.067662 ]
+
+    """
+    return build_unary_elemwise_math_op("cosh", x, name)
+
+
+def erf(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the Gauss error value of Blob.
+
+    The equation is:
+
+    .. math ::
+
+        out = \\frac{2}{\\sqrt{\\pi}}*\\int_{0}^{x}e^{-z^2}\\mathrm{d}{z}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def erf_Job(x: tp.Numpy.Placeholder((3,))
+                    ) -> tp.Numpy:
+            return flow.math.erf(x)
+
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = erf_Job(x)
+
+        # out [0.8427008 0.9953223 0.9999779]
+
+    """
+    return build_unary_elemwise_math_op("erf", x, name)
+
+
+def erfc(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the :math:`1-erf(x)`, for more details of `erf` function
+    please refer to `math.erf`.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def erfc_Job(x: tp.Numpy.Placeholder((3,))
+                    ) -> tp.Numpy:
+            return flow.math.erfc(x)
+
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = erfc_Job(x)
+
+        # out [1.5729921e-01 4.6777353e-03 2.2090495e-05]
+
+    """
+    return build_unary_elemwise_math_op("erfc", x, name)
+
+
+def exp(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the exponential of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = e^x
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def exp_Job(x: tp.Numpy.Placeholder((3,))
+                    ) -> tp.Numpy:
+            return flow.math.exp(x)
+
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = exp_Job(x)
+
+        # out [ 2.7182817  7.389056  20.085537 ]
+
+    """
+    return build_unary_elemwise_math_op("exp", x, name)
+
+
+def expm1(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes :math:`y=e^x-1`.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def expm1_Job(x: tp.Numpy.Placeholder((3,))
+                    ) -> tp.Numpy:
+            return flow.math.expm1(x)
+
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = expm1_Job(x)
+
+        # out [ 1.7182819  6.389056  19.085537 ]
+
+    """
+    return build_unary_elemwise_math_op("expm1", x, name)
+
+
+def floor(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the largest integer not greater than input Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def floor_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.floor(x)
+
+
+        x = np.array([1.3, 1.5, 2.7]).astype(np.float32)
+        out = floor_Job(x)
+
+        # out [1. 1. 2.]
+
+    """
+    return build_unary_elemwise_math_op("floor", x, name)
+
+
+def lgamma(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the :math:`Gamma(x)` value.
+
+    The equation is:
+
+    .. math::
+
+        out = \\int_{0}^{\\infty}t^{x-1}*e^{-t}\\mathrm{d}{t}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def lgamma_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.lgamma(x)
+
+
+        x = np.array([1.3, 1.5, 2.7]).astype(np.float32)
+        out = lgamma_Job(x)
+
+        # out [-0.1081748  -0.12078223  0.4348206 ]
+
+    """
+    return build_unary_elemwise_math_op("lgamma", x, name)
+
+
+def log(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the log value of input Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def log_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.log(x)
+
+
+        x = np.array([1.3, 1.5, 2.7]).astype(np.float32)
+        out = log_Job(x)
+
+        # out [0.26236424 0.40546513 0.9932518 ]
+
+    """
+    return build_unary_elemwise_math_op("log", x, name)
+
+
+def log1p(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the :math:`log(x)+1` value of input Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def log1p_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.log1p(x)
+
+
+        x = np.array([1.3, 1.5, 2.7]).astype(np.float32)
+        out = log1p_Job(x)
+
+        # out [0.8329091  0.91629076 1.3083328 ]
+
+    """
+    return build_unary_elemwise_math_op("log1p", x, name)
+
+
+def log_sigmoid(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the log sigmoid value of input Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = log(\\frac{1}{1+e^{-x}})
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def log_sigmoid_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.log_sigmoid(x)
+
+
+        x = np.array([1.3, 1.5, 2.7]).astype(np.float32)
+        out = log_sigmoid_Job(x)
+
+        # out [-0.24100842 -0.20141333 -0.0650436 ]
+
+    """
+    return build_unary_elemwise_math_op("log_sigmoid", x, name)
+
+
+def negative(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the negative value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def negative_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.negative(x)
+
+
+        x = np.array([1.3, 1.5, 2.7]).astype(np.float32)
+        out = negative_Job(x)
+
+        # out [-1.3 -1.5 -2.7]
+
+    """
+    return build_unary_elemwise_math_op("negative", x, name)
+
+
+def reciprocal(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the reciprocal of x.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{1}{x}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reciprocal_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.reciprocal(x)
+
+
+        x = np.array([1, 2, 4]).astype(np.float32)
+        out = reciprocal_Job(x)
+
+        # out [1.   0.5  0.25]
+
+    """
+    return build_unary_elemwise_math_op("reciprocal", x, name)
+
+
+def reciprocal_no_nan(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the safe reciprocal of x. If x is zero, the reciprocal will
+    be also set to zero.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reciprocal_no_nan_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.reciprocal_no_nan(x)
+
+
+        x = np.array([0, 2, 4]).astype(np.float32)
+        out = reciprocal_no_nan_Job(x)
+
+        # out [0.   0.5  0.25]
+
+    """
+    return build_unary_elemwise_math_op("reciprocal_no_nan", x, name)
+
+
+def rint(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the closest integer to Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def rint_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.rint(x)
+
+
+        x = np.array([1.49999, 1.500001, 2.7]).astype(np.float32)
+        out = rint_Job(x)
+
+        # out [1. 2. 3.]
+
+    """
+    return build_unary_elemwise_math_op("rint", x, name)
+
+
+def round(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator rounds the value of Blob to the nearest integer.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def round_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.round(x)
+
+
+        x = np.array([1.49999, 1.500001, 2.7]).astype(np.float32)
+        out = round_Job(x)
+
+        # out [1. 2. 3.]
+
+    """
+    return build_unary_elemwise_math_op("round", x, name)
+
+
+def rsqrt(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the reciprocal of square root value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out=\\frac{1}{\\sqrt{x}}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def rsqrt_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.rsqrt(x)
+
+
+        x = np.array([4, 16, 25]).astype(np.float32)
+        out = rsqrt_Job(x)
+
+        # out [0.5  0.25 0.2 ]
+
+    """
+    return build_unary_elemwise_math_op("rsqrt", x, name)
+
+
+def sigmoid_v2(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the sigmoid value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out=\\frac{1}{1+e^{-x}}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sigmoidv2_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.sigmoid_v2(x)
+
+        x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        out = sigmoidv2_Job(x)
+
+        # out [0.37754068 0.5        0.62245935]
+
+    """
+    return build_unary_elemwise_math_op("sigmoid_v2", x, name)
+
+
+def sign(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns the sign of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sign_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.sign(x)
+
+
+        x = np.array([-2, 0, 2]).astype(np.float32)
+        out = sign_Job(x)
+
+        # out [-1.  0.  1.]
+
+    """
+    return build_unary_elemwise_math_op("sign", x, name)
+
+
+def sin(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the sin value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sin_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.sin(x)
+
+
+        x = np.array([-1/6*np.pi, 0, 1/6*np.pi]).astype(np.float32)
+        out = sin_Job(x)
+
+        # out [-0.5  0.   0.5]
+
+    """
+    return build_unary_elemwise_math_op("sin", x, name)
+
+
+def sinh(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the hyperbolic sine value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out =\\frac{e^x-e^{-x}}{2}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sinh_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.sinh(x)
+
+
+        x = np.array([-1, 0, 1]).astype(np.float32)
+        out = sinh_Job(x)
+
+        # out [-1.1752012  0.         1.1752012]
+
+    """
+    return build_unary_elemwise_math_op("sinh", x, name)
+
+
+def softplus(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the softplus value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = log(e^x+1)
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def softplus_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.softplus(x)
+
+
+        x = np.array([-1, 0, 1]).astype(np.float32)
+        out = softplus_Job(x)
+
+        # out [0.31326166 0.6931472  1.3132616 ]
+
+    """
+    return build_unary_elemwise_math_op("softplus", x, name)
+
+
+def sqrt(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the sqrt root value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sqrt_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.sqrt(x)
+
+
+        x = np.array([4, 16, 25]).astype(np.float32)
+        out = sqrt_Job(x)
+
+        # out [2. 4. 5.]
+
+    """
+    return build_unary_elemwise_math_op("sqrt", x, name)
+
+
+def square(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the square value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def square_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.square(x)
+
+
+        x = np.array([2, 3, 4]).astype(np.float32)
+        out = square_Job(x)
+
+        # out [ 4.  9. 16.]
+
+    """
+    return build_unary_elemwise_math_op("square", x, name)
+
+
+def tan(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the tan value of Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def tan_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.tan(x)
+
+
+        x = np.array([-1/4*np.pi, 0, 1/4*np.pi]).astype(np.float32)
+        out = tan_Job(x)
+
+        # out [-1.  0.  1.]
+
+    """
+    return build_unary_elemwise_math_op("tan", x, name)
+
+
+def tanh(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the hyperbolic tangent value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{e^x-e^{-x}}{e^x+e^{-x}} 
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def tanh_Job(x: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.tanh(x)
+
+
+        x = np.array([-1, 0, 1]).astype(np.float32)
+        out = tanh_Job(x)
+
+        # out [-0.7615942  0.         0.7615942]
+
+    """
+    return build_unary_elemwise_math_op("tanh", x, name)
+
+
+def tanh_v2(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the hyperbolic tangent value of Blob.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{e^x-e^{-x}}{e^x+e^{-x}}
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+    """
+    print(
+        "WARNING: flow.math.tanh_v2 has been deprecated. Please replace it by flow.math.tanh.\n        "
+    )
+    print(traceback.format_stack()[-2])
+    return flow.math.tanh(x, name)
diff --git a/python/oneflow/compatible/single_client/ops/nn_ops.py b/python/oneflow/compatible/single_client/ops/nn_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b09b80886f5e67be22a684033bbdcb4c336443a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/nn_ops.py
@@ -0,0 +1,4353 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import os
+import random
+import sys
+from typing import List, Optional, Sequence, Tuple, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import module as module_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+IntPair = Tuple[int, int]
+
+
+def calc_same_padding(input_size, filter_size, dilation_rate, stride):
+    effective_filter_size = (filter_size - 1) * dilation_rate + 1
+    output_size = (input_size + stride - 1) // stride
+    padding_needed = max(
+        0, int((output_size - 1) * stride + effective_filter_size - input_size)
+    )
+    return padding_needed
+
+
+def get_dhw_offset(channel_pos):
+    if channel_pos == "channels_first":
+        return 2
+    else:
+        return 1
+
+
+def check_conv_cudnn_padding_support(
+    input_size, pad, filter_size, dilation_rate, stride, is_dynamic
+):
+    assert len(pad) == 2
+    if pad[0] == pad[1]:
+        return True
+    elif is_dynamic or pad[0] < pad[1] or pad[0] - pad[1] > 1:
+        return False
+    else:
+        effective_filter_size = (filter_size - 1) * dilation_rate + 1
+        cudnn_output_size = (
+            input_size + 2 * pad[0] - effective_filter_size + stride
+        ) // stride
+        output_size = (
+            input_size + pad[0] + pad[1] - effective_filter_size + stride
+        ) // stride
+        return cudnn_output_size == output_size
+
+
+def check_ndim_conv_cudnn_padding_support(
+    inputs_shape,
+    ndim_pads_list,
+    kernel_sizes,
+    dilations,
+    strides,
+    dhw_offset,
+    is_dynamic,
+):
+    ndims = len(ndim_pads_list)
+    for i in range(ndims):
+        cudnn_support = check_conv_cudnn_padding_support(
+            inputs_shape[dhw_offset + i],
+            ndim_pads_list[i],
+            kernel_sizes[i],
+            dilations[i],
+            strides[i],
+            is_dynamic,
+        )
+        if not cudnn_support:
+            return False
+    return True
+
+
+def get_ndim_pads_list(padding, dhw_offset, ndims):
+    pads_list = []
+    for i in range(len(padding)):
+        pad = padding[i]
+        if isinstance(pad, int):
+            pad = [pad, pad]
+        elif isinstance(pad, (list, tuple)):
+            assert len(pad) == 2
+            pad = [pad[0], pad[1]]
+        else:
+            raise ValueError("padding must be list tuple or int")
+        if i in range(dhw_offset, dhw_offset + ndims):
+            pads_list.append(pad)
+        else:
+            assert pad == [0, 0]
+    return pads_list
+
+
+def calc_ndim_same_padding(
+    input_shape, padding, kernel_sizes, dilations, strides, dhw_offset
+):
+    ndim_padding_needed = []
+    ndims = len(kernel_sizes)
+    for i in range(ndims):
+        ndim_padding_needed.append(
+            calc_same_padding(
+                input_shape[dhw_offset + i], kernel_sizes[i], dilations[i], strides[i]
+            )
+        )
+    pads_small = [padding_needed // 2 for padding_needed in ndim_padding_needed]
+    pads_large = [ndim_padding_needed[i] - pads_small[i] for i in range(ndims)]
+    if padding.upper() == "SAME_LOWER":
+        return [[pads_large[i], pads_small[i]] for i in range(ndims)]
+    elif padding.upper() == "SAME_UPPER":
+        return [[pads_small[i], pads_large[i]] for i in range(ndims)]
+    else:
+        raise NotImplementedError
+
+
+def calc_conv_padding(inputs, padding, data_format, kernel_sizes, dilations, strides):
+    ndims = len(inputs.shape) - 2
+    assert len(kernel_sizes) == ndims
+    assert len(dilations) == ndims
+    assert len(strides) == ndims
+    is_dynamic = inputs.is_dynamic
+    channel_pos = "channels_first" if data_format.startswith("NC") else "channels_last"
+    dhw_offset = get_dhw_offset(channel_pos)
+    ndim_pads_list = []
+    if isinstance(padding, str):
+        padding = "SAME_LOWER" if padding.upper() == "SAME" else padding
+        assert padding.upper() in ["VALID", "SAME_LOWER", "SAME_UPPER"]
+        if padding.upper() == "VALID":
+            return_pads_list = [[0, 0]] * ndims
+            return (inputs, return_pads_list)
+        elif is_dynamic:
+            return_pads_list = [[0, 0]] * ndims
+            inputs = flow.same_padding(
+                inputs,
+                padding.lower(),
+                data_format=data_format,
+                kernel_size=kernel_sizes,
+                strides=strides,
+                dilation_rate=dilations,
+            )
+            return (inputs, return_pads_list)
+        else:
+            ndim_pads_list = calc_ndim_same_padding(
+                inputs.shape, padding, kernel_sizes, dilations, strides, dhw_offset
+            )
+            assert len(ndim_pads_list) == ndims
+    elif isinstance(padding, (list, tuple)):
+        assert len(padding) == ndims + 2
+        ndim_pads_list = get_ndim_pads_list(padding, dhw_offset, ndims)
+        assert len(ndim_pads_list) == ndims
+    else:
+        raise ValueError("padding must be str or a list.")
+    cudnn_padding_support = check_ndim_conv_cudnn_padding_support(
+        inputs.shape,
+        ndim_pads_list,
+        kernel_sizes,
+        dilations,
+        strides,
+        dhw_offset,
+        is_dynamic,
+    )
+    if cudnn_padding_support:
+        return (inputs, ndim_pads_list)
+    else:
+        pad_op_list = [[0, 0]] * (ndims + 2)
+        for i in range(ndims):
+            pad_op_list[dhw_offset + i] = ndim_pads_list[i]
+        inputs = flow.pad(inputs, paddings=pad_op_list)
+        return_pads_list = [[0, 0]] * ndims
+        return (inputs, return_pads_list)
+
+
+class ConvUtil(object):
+    @classmethod
+    def split(cls, x, axis, split_num):
+        split_len = x.shape[axis] // split_num
+        result_list = []
+        slice_begin = [0] * len(x.shape)
+        slice_size = [-1] * len(x.shape)
+        slice_size[axis] = split_len
+        for i in range(split_num):
+            slice_begin[axis] = i * split_len
+            result = flow.slice(x, slice_begin, slice_size)
+            result_list.append(result)
+        return result_list
+
+
+def conv_op(
+    conv_type,
+    inputs,
+    filters,
+    bias,
+    padding_before,
+    channel_pos,
+    kernel_size_list,
+    strides,
+    dilations,
+    groups,
+    name,
+):
+    op_builder = (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Conv_"))
+        .Op(conv_type)
+        .Input("in", [inputs])
+        .Input("weight", [filters])
+        .Output("out")
+        .Attr("filters", filters.shape[0])
+        .Attr("padding_before", padding_before)
+        .Attr("data_format", channel_pos)
+        .Attr("kernel_size", kernel_size_list)
+        .Attr("strides", strides)
+        .Attr("dilation_rate", dilations)
+        .Attr("groups", groups)
+    )
+    if bias is not None:
+        op_builder = op_builder.Input("bias", [bias])
+    return op_builder.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def conv1d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    filters: oneflow._oneflow_internal.BlobDesc,
+    strides: Union[int, Tuple[int]],
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair]],
+    data_format: str = "NCW",
+    dilations: Optional[Union[int, Tuple[int]]] = None,
+    groups: int = 1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """1D convolution layer.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A 3D input `Blob`. [batch_num, channel, width]
+        filters (oneflow._oneflow_internal.BlobDesc): A `Blob` with the same type as `input` and has the shape [out_channels, in_channels//groups, filter_width] for `NCW`, or [out_channels, filter_width, in_channels//groups] for `NWC`
+        strides (Union[int, Tuple[int]]): An int or list of `ints` that has length `1`. The stride of the sliding window for each dimension of `input`.
+        padding (Union[str, Tuple[IntPair, IntPair, IntPair]]): padding: `string` `"SAME"` or `"SAME_LOWER"` or `"SAME_UPPER"` or `"VALID" or Tuple[IntPair, IntPair, IntPair]` indicating the type of padding algorithm to use, or a list indicating the explicit paddings at the start and end of each dimension.
+        data_format (str, optional): `"NWC" or "NCW"`. Defaults to `"NCW"`.
+        dilations (Optional[Union[int, Tuple[int]]], optional): An int or list of `ints` that has length `1`. The dilation factor for each dimension of `input`. Defaults to None.
+        groups (int, optional): int value greater than 0. Defaults to 1.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Raises:
+        ValueError: strides must be an int or a list.
+        ValueError: padding must be "SAME" or "SAME_LOWER" or "SAME_UPPER" or "VALID" or Tuple[IntPair, IntPair, IntPair, IntPair].
+        ValueError: data_format must be "NWC" or "NCW".
+        ValueError: dilations must be an int or a list.
+        ValueError: invalid data_format.
+        ValueError: data_format NWC not support groups > 1
+        ValueError: invalid data_format.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as `input` and the same outer batch shape.
+
+    Note:
+
+        This api is more flexible, if you're new to OneFlow, it's more recommend to use `oneflow.compatible.single_client.layers.conv1d`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def conv1d(input, filters, kernel_size, strides, padding, name):
+            input_shape = input.shape
+            weight_initializer = flow.truncated_normal(0.1)
+            weight_regularizer = flow.regularizers.l2(0.0005)
+            weight_shape = (filters,
+                            input_shape[1],
+                            kernel_size)
+
+            weight = flow.get_variable(
+                name + "-weight",
+                shape=weight_shape,
+                initializer=weight_initializer,
+                regularizer=weight_regularizer,
+            )
+            return flow.nn.conv1d(input, weight, strides, padding, name=name)
+
+
+        @flow.global_function()
+        def conv1d_Job(x: tp.Numpy.Placeholder((1, 64, 32))
+        ) -> tp.Numpy:
+            conv = conv1d(x,
+                        filters=32,
+                        kernel_size=3,
+                        strides=1,
+                        padding='SAME',
+                        name="Convlayer")
+            return conv
+
+
+        x = np.random.randn(1, 64, 32).astype(np.float32)
+        out = conv1d_Job(x)
+
+        # out.shape (1, 32, 32)
+
+    """
+    assert len(input.shape) == 3
+    assert len(filters.shape) == 3
+    if isinstance(strides, (list, tuple)):
+        assert len(strides) == 1, ValueError(
+            "strides length must be 1 when passed as a list."
+        )
+    elif isinstance(strides, int):
+        strides = [strides]
+    else:
+        raise ValueError("strides must be an int or a list.")
+    if data_format.upper() != "NCW" and data_format.upper() != "NWC":
+        raise ValueError('data_format must be "NCW" or "NWC".')
+    channel_pos = "channels_first" if data_format == "NCW" else "channels_last"
+    if dilations is None:
+        dilations = [1]
+    elif isinstance(dilations, (list, tuple)):
+        assert len(dilations) == 1, ValueError(
+            "dilations length must be 1 when passed as a list."
+        )
+    elif isinstance(dilations, int):
+        dilations = [dilations]
+    else:
+        raise ValueError("dilations must be an int or a list.")
+    if channel_pos == "channels_first":
+        kernel_size_list = filters.shape[2:3]
+        in_channel_axis = 1
+        filter_out_axis = 0
+        filter_in_axis = 1
+    elif channel_pos == "channels_last":
+        kernel_size_list = filters.shape[-2:-1]
+        in_channel_axis = 2
+        filter_out_axis = 0
+        filter_in_axis = 2
+        if groups > 1:
+            raise ValueError("data_format NWC not support groups > 1")
+    else:
+        raise ValueError("invalid data_format")
+    assert isinstance(kernel_size_list, tuple)
+    assert isinstance(groups, int)
+    assert groups > 0
+    assert groups <= filters.shape[filter_out_axis]
+    assert filters.shape[filter_out_axis] % groups == 0
+    assert groups <= input.shape[in_channel_axis]
+    assert input.shape[in_channel_axis] % groups == 0
+    assert filters.shape[filter_in_axis] == input.shape[in_channel_axis] // groups
+    (inputs, pads_list) = calc_conv_padding(
+        input, padding, data_format.upper(), kernel_size_list, dilations, strides
+    )
+    assert len(pads_list) == len(inputs.shape) - 2
+    padding_before = [pad[0] for pad in pads_list]
+    if (
+        groups > 1
+        and flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu"
+    ):
+        in_split_list = ConvUtil.split(inputs, axis=in_channel_axis, split_num=groups)
+        filter_split_list = ConvUtil.split(
+            filters, axis=filter_out_axis, split_num=groups
+        )
+        out_list = []
+        name = name if name is not None else id_util.UniqueStr("Conv1d_")
+        for i in range(len(in_split_list)):
+            out_list.append(
+                conv_op(
+                    "conv1d",
+                    in_split_list[i],
+                    filter_split_list[i],
+                    None,
+                    padding_before,
+                    channel_pos,
+                    kernel_size_list,
+                    strides,
+                    dilations,
+                    groups=1,
+                    name=name + str(i),
+                )
+            )
+        return flow.concat(out_list, axis=in_channel_axis)
+    else:
+        return conv_op(
+            "conv1d",
+            inputs,
+            filters,
+            None,
+            padding_before,
+            channel_pos,
+            kernel_size_list,
+            strides,
+            dilations,
+            groups,
+            name,
+        )
+
+
+def conv2d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    filters: oneflow._oneflow_internal.BlobDesc,
+    strides: Union[int, IntPair],
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair, IntPair]],
+    bias: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    data_format: str = "NCHW",
+    dilations: Optional[Union[int, IntPair]] = None,
+    groups: int = 1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """2D convolution layer.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A 4D input `Blob`. [batch_num, channel, height, width]
+        filters (oneflow._oneflow_internal.BlobDesc): A `Blob` with the same type as `input` and has the shape `[out_channels, in_channels//groups, filter_height, filter_width] for NCHW, or [out_channels, filter_height, filter_width, in_channels//groups] for NHWC`
+        strides (Union[int, IntPair]): An int or list of `ints` that has length `2`. The stride of the sliding window for each dimension of `input`.
+        padding (Union[str, Tuple[IntPair, IntPair, IntPair, IntPair]]): padding: `string` `"SAME"` or `"SAME_LOWER"` or `"SAME_UPPER"` or `"VALID" or Tuple[IntPair, IntPair, IntPair, IntPair]` indicating the type of padding algorithm to use, or a list indicating the explicit paddings at the start and end of each dimension.
+        data_format (str, optional): `"NHWC"` or `"NCHW"`. Defaults to `"NCHW"`.
+        dilations (Optional[Union[int, IntPair]], optional): An int or list of `ints` that has length `2`. The dilation factor for each dimension of `input`. Defaults to None.
+        groups (int, optional): int value greater than 0. Defaults to 1.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Raises:
+        ValueError: strides must be an int or a list.
+        ValueError: padding must be `"SAME"` or `"SAME_LOWER" or `"SAME_UPPER"` or `"VALID"` or Tuple[IntPair, IntPair, IntPair, IntPair].
+        ValueError: data_format must be `"NHWC"` or `"NCHW"`.
+        ValueError: dilations must be an int or a list.
+        ValueError: invalid data_format.
+        ValueError: data_format NHWC not support groups > 1
+        ValueError: invalid data_format.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as `input` and the same outer batch shape.
+
+    Note:
+
+        This api is more flexible, if you're new to OneFlow, it's more recommend to use `oneflow.compatible.single_client.layers.conv2d`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def conv2d(input, filters, kernel_size, strides, padding, name):
+            input_shape = input.shape
+            weight_initializer = flow.truncated_normal(0.1)
+            weight_regularizer = flow.regularizers.l2(0.0005)
+            weight_shape = (filters,
+                            input_shape[1],
+                            kernel_size[0],
+                            kernel_size[1])
+
+            weight = flow.get_variable(
+                name + "-weight",
+                shape=weight_shape,
+                initializer=weight_initializer,
+                regularizer=weight_regularizer,
+            )
+            return flow.nn.conv2d(input, weight, strides, padding, name=name)
+
+
+        @flow.global_function()
+        def conv2d_Job(x: tp.Numpy.Placeholder((1, 64, 32, 32))
+        ) -> tp.Numpy:
+            conv = conv2d(x,
+                        filters=128,
+                        kernel_size=[3, 3],
+                        strides=2,
+                        padding='SAME',
+                        name="Convlayer")
+            return conv
+
+
+        x = np.random.randn(1, 64, 32, 32).astype(np.float32)
+        out = conv2d_Job(x)
+
+        # out.shape (1, 128, 16, 16)
+
+    """
+    assert len(input.shape) == 4
+    assert len(filters.shape) == 4
+    if bias is not None:
+        assert len(bias.shape) == 1
+    if isinstance(strides, (list, tuple)):
+        assert len(strides) == 2, ValueError(
+            "strides length must be 2 when passed as a list."
+        )
+    elif isinstance(strides, int):
+        strides = [strides, strides]
+    else:
+        raise ValueError("strides must be an int or a list.")
+    if data_format.upper() != "NCHW" and data_format.upper() != "NHWC":
+        raise ValueError('data_format must be "NHWC" or "NCHW".')
+    channel_pos = "channels_first" if data_format == "NCHW" else "channels_last"
+    if dilations is None:
+        dilations = [1, 1]
+    elif isinstance(dilations, (list, tuple)):
+        assert len(dilations) == 2, ValueError(
+            "dilations length must be 2 when passed as a list."
+        )
+    elif isinstance(dilations, int):
+        dilations = [dilations, dilations]
+    else:
+        raise ValueError("dilations must be an int or a list.")
+    assert isinstance(groups, int)
+    assert groups > 0
+    if data_format.upper() == "NCHW":
+        kernel_size_list = filters.shape[2:4]
+        in_channel_axis = 1
+        filter_out_axis = 0
+        filter_in_axis = 1
+    elif data_format.upper() == "NHWC":
+        kernel_size_list = filters.shape[-3:-1]
+        in_channel_axis = 3
+        filter_out_axis = 0
+        filter_in_axis = 3
+        if (
+            groups > 1
+            and flow.current_scope().device_parallel_desc_symbol.device_tag == "gpu"
+        ):
+            raise ValueError("gpu data_format NHWC not support groups > 1")
+    else:
+        raise ValueError('data_format must be "NHWC" or "NCHW".')
+    assert isinstance(kernel_size_list, tuple)
+    (inputs, pads_list) = calc_conv_padding(
+        input, padding, data_format.upper(), kernel_size_list, dilations, strides
+    )
+    assert len(pads_list) == len(inputs.shape) - 2
+    padding_before = [pad[0] for pad in pads_list]
+    assert groups <= filters.shape[filter_out_axis]
+    assert filters.shape[filter_out_axis] % groups == 0
+    assert groups <= inputs.shape[in_channel_axis]
+    assert inputs.shape[in_channel_axis] % groups == 0
+    assert filters.shape[filter_in_axis] == inputs.shape[in_channel_axis] // groups
+    if bias is not None:
+        assert bias.shape[filter_out_axis] == filters.shape[filter_out_axis]
+    if (
+        groups > 1
+        and flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu"
+    ):
+        in_split_list = ConvUtil.split(inputs, axis=in_channel_axis, split_num=groups)
+        filter_split_list = ConvUtil.split(
+            filters, axis=filter_out_axis, split_num=groups
+        )
+        bias_spilt_list = (
+            ConvUtil.split(bias, axis=filter_out_axis, split_num=groups)
+            if bias is not None
+            else [None for _ in range(groups)]
+        )
+        out_list = []
+        name = name if name is not None else id_util.UniqueStr("Conv2d_")
+        for i in range(len(in_split_list)):
+            out_list.append(
+                conv_op(
+                    "conv2d",
+                    in_split_list[i],
+                    filter_split_list[i],
+                    bias_spilt_list[i],
+                    padding_before,
+                    channel_pos,
+                    kernel_size_list,
+                    strides,
+                    dilations,
+                    groups=1,
+                    name=name + str(i),
+                )
+            )
+        return flow.concat(out_list, axis=in_channel_axis)
+    else:
+        return conv_op(
+            "conv2d",
+            inputs,
+            filters,
+            bias,
+            padding_before,
+            channel_pos,
+            kernel_size_list,
+            strides,
+            dilations,
+            groups,
+            name,
+        )
+
+
+def conv3d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    filters: oneflow._oneflow_internal.BlobDesc,
+    strides: Union[int, Sequence[int]],
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair, IntPair, IntPair]],
+    data_format: str = "NCDHW",
+    dilations: Optional[Union[int, Sequence[int]]] = None,
+    groups: int = 1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """3D convolution layer.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc):  A 5D input `Blob`. [batch_num, channel, depth, height, width]
+        filters (oneflow._oneflow_internal.BlobDesc): A `Blob` with the same type as `input` and has the shape `[out_channels, in_channels//groups, filter_depth, filter_height, filter_width] for NCDHW, or [out_channels, filter_depth, filter_height, filter_width, in_channels//groups] for NDHWC`
+        strides (Union[int, Sequence[int]]): An `int` or `list of ints` that has length `3`. The stride of the sliding window for each dimension of `input`.
+        padding (Union[str, Tuple[IntPair, IntPair, IntPair, IntPair, IntPair]]): padding: `string` `"SAME"` or `"SAME_LOWER"` or `"SAME_UPPER"` or `"VALID"` or Tuple[IntPair, IntPair, IntPair, IntPair, IntPair]` indicating the type of padding algorithm to use, or a list indicating the explicit paddings at the start and end of each dimension.
+        data_format (str, optional): `"NDHWC" or "NCDHW"`. Defaults to `"NCDHW"`.
+        dilations (Optional[Union[int, Sequence[int]]], optional): An int or list of `ints` that has length `3`. The dilation factor for each dimension of `input`. Defaults to None.
+        groups (int, optional): int value greater than 0. Defaults to 1.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Raises:
+        ValueError: strides must be an int or a list.
+        ValueError: padding must be "SAME" or "SAME_LOWER" or "SAME_UPPER" or "VALID" or Tuple[IntPair, IntPair, IntPair, IntPair, IntPair].
+        ValueError: data_format must be "NDHWC" or "NCDHW".
+        ValueError: dilations must be an int or a list.
+        ValueError: invalid data_format.
+        ValueError: data_format NDHWC not support groups > 1
+        ValueError: invalid data_format.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as `input` and the same outer batch shape.
+
+    Note:
+
+        This api is more flexible, if you're new to OneFlow, it's more recommend to use `oneflow.compatible.single_client.layers.conv3d`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def conv3d(input, filters, kernel_size, strides, padding, name):
+            input_shape = input.shape
+            weight_initializer = flow.truncated_normal(0.1)
+            weight_regularizer = flow.regularizers.l2(0.0005)
+            weight_shape = (filters,
+                            input_shape[1],
+                            kernel_size[0],
+                            kernel_size[1],
+                            kernel_size[2])
+
+            weight = flow.get_variable(
+                name + "-weight",
+                shape=weight_shape,
+                initializer=weight_initializer,
+                regularizer=weight_regularizer,
+            )
+            return flow.nn.conv3d(input, weight, strides, padding, name=name)
+
+
+        @flow.global_function()
+        def conv3d_Job(x: tp.Numpy.Placeholder((1, 64, 10, 16, 16))
+        ) -> tp.Numpy:
+            conv = conv3d(x,
+                        filters=128,
+                        kernel_size=[3, 3, 3],
+                        strides=1,
+                        padding='SAME',
+                        name="Convlayer")
+            return conv
+
+
+        x = np.random.randn(1, 64, 10, 16, 16).astype(np.float32)
+        out = conv3d_Job(x)
+
+        # out.shape (1, 128, 10, 16, 16)
+
+    """
+    need_transpose = 0
+    if data_format.upper() == "NDHWC":
+        need_transpose = 1
+        data_format = "NCDHW"
+    if need_transpose:
+        input = flow.transpose(input, perm=[0, 4, 1, 2, 3])
+        filters = flow.transpose(filters, perm=[0, 4, 1, 2, 3])
+        if isinstance(padding, (list, tuple)):
+            padding = list(padding)
+            (padding[1], padding[4]) = (padding[4], padding[1])
+    assert len(input.shape) == 5
+    assert len(filters.shape) == 5
+    if isinstance(strides, (list, tuple)):
+        assert len(strides) == 3, ValueError(
+            "strides length must be 3 when passed as a list."
+        )
+    elif isinstance(strides, int):
+        strides = [strides, strides, strides]
+    else:
+        raise ValueError("strides must be an int or a list.")
+    if data_format.upper() != "NCDHW" and data_format.upper() != "NDHWC":
+        raise ValueError('data_format must be "NDHWC" or "NCDHW".')
+    channel_pos = "channels_first" if data_format == "NCDHW" else "channels_last"
+    if dilations is None:
+        dilations = [1, 1, 1]
+    elif isinstance(dilations, (list, tuple)):
+        assert len(dilations) == 3, ValueError(
+            "dilations length must be 3 when passed as a list."
+        )
+    elif isinstance(dilations, int):
+        dilations = [dilations, dilations, dilations]
+    else:
+        raise ValueError("dilations must be an int or a list.")
+    if channel_pos == "channels_first":
+        kernel_size_list = filters.shape[2:5]
+        in_channel_axis = 1
+        filter_out_axis = 0
+        filter_in_axis = 1
+    elif channel_pos == "channels_last":
+        kernel_size_list = filters.shape[-4:-1]
+        in_channel_axis = 4
+        filter_out_axis = 0
+        filter_in_axis = 4
+        if groups > 1:
+            raise ValueError("data_format NDHWC not support groups > 1")
+    else:
+        raise ValueError("invalid data_format")
+    assert isinstance(kernel_size_list, tuple)
+    assert isinstance(groups, int)
+    assert groups > 0
+    assert groups <= filters.shape[filter_out_axis]
+    assert filters.shape[filter_out_axis] % groups == 0
+    assert groups <= input.shape[in_channel_axis]
+    assert input.shape[in_channel_axis] % groups == 0
+    assert filters.shape[filter_in_axis] == input.shape[1] // groups
+    (inputs, pads_list) = calc_conv_padding(
+        input, padding, data_format.upper(), kernel_size_list, dilations, strides
+    )
+    assert len(pads_list) == len(inputs.shape) - 2
+    padding_before = [pad[0] for pad in pads_list]
+    if (
+        groups > 1
+        and flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu"
+    ):
+        in_split_list = ConvUtil.split(inputs, axis=in_channel_axis, split_num=groups)
+        filter_split_list = ConvUtil.split(
+            filters, axis=filter_out_axis, split_num=groups
+        )
+        out_list = []
+        name = name if name is not None else id_util.UniqueStr("Conv3d_")
+        for i in range(len(in_split_list)):
+            out_list.append(
+                conv_op(
+                    "conv3d",
+                    in_split_list[i],
+                    filter_split_list[i],
+                    None,
+                    padding_before,
+                    channel_pos,
+                    kernel_size_list,
+                    strides,
+                    dilations,
+                    groups=1,
+                    name=name + str(i),
+                )
+            )
+        output = flow.concat(out_list, axis=in_channel_axis)
+    else:
+        output = conv_op(
+            "conv3d",
+            inputs,
+            filters,
+            None,
+            padding_before,
+            channel_pos,
+            kernel_size_list,
+            strides,
+            dilations,
+            groups,
+            name,
+        )
+    if need_transpose:
+        output = flow.transpose(output, perm=[0, 2, 3, 4, 1])
+    return output
+
+
+def moments(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axes: List[int],
+    keepdims: Optional[bool] = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the mean and variance value of input Blob.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        axes (List): Array of ints. Axes along which to compute the mean and variance
+        keepdims (bool, optional): Whether to keep the same dimensanality as the input x. Defaults to False.
+        name (str, optional): The operator's name. Defaults to None.
+
+    Returns:
+        remote_blob: Two Blobs, mean and variance.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function()
+        def moments_Job(x: tp.Numpy.Placeholder((5,))
+        ) -> Tuple[tp.Numpy, tp.Numpy]:
+            return flow.nn.moments(x, axes=[0])
+
+
+        x = np.array([1, 2, 3, 4, 5]).astype(np.float32)
+        mean, variance = moments_Job(x)
+
+        # mean: [3.]
+        # variance: [2.]
+
+    """
+    assert isinstance(axes, list)
+    if name is None:
+        name = id_util.UniqueStr("Moments_")
+    with flow.scope.namespace(name):
+        return (
+            flow.math.reduce_mean(x, axis=axes, keepdims=keepdims),
+            flow.math.reduce_variance(x, axis=axes, keepdims=keepdims),
+        )
+
+
+def group_normalization(
+    x: oneflow._oneflow_internal.BlobDesc,
+    num_groups: int = 32,
+    eps: float = 1e-05,
+    affine: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Applies Group Normalization over a ND(N>=3) input.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): input tensor with shape (N,C,鈭�), where C means the number of channels.
+        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
+        affine (bool): A boolean value that when set to True, this module has learnable affine parameters,
+                       initialized the same way as done for batch normalization. Default: True.
+        name (Optional[str], optional): Name of this op.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The normalized input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def group_norm_Job(x: tp.Numpy.Placeholder((4, 4, 32, 32))
+        ) -> tp.Numpy:
+            group_norm = flow.nn.GroupNorm(
+                x,
+                num_group=2,
+                eps=1e-5,
+                affine=True,
+            )
+            return group_norm
+
+        x = np.random.random(size=(4, 4, 32, 32)).astype(np.float32)
+        out = group_norm_Job(x)
+
+    """
+    assert len(x.shape) >= 3
+    assert (
+        x.shape[1] % num_groups == 0
+    ), "The channel should be divisible by num_groups."
+    if name is None:
+        name = id_util.UniqueStr("GroupNorm_")
+    channel = x.shape[1]
+    assert channel % num_groups == 0
+    group_size = channel // num_groups
+    orig_shape = x.shape
+    reshape_to_1d = flow.reshape(x, shape=[orig_shape[0], num_groups, -1])
+    (mean, variance) = flow.nn.moments(reshape_to_1d, [2], keepdims=True)
+    normalized = (reshape_to_1d - mean) / flow.math.sqrt(variance + eps)
+    normalized = flow.reshape(normalized, shape=[orig_shape[0], channel, -1])
+    if affine == True:
+        gamma = flow.get_variable(
+            name + "_gamma",
+            shape=(1, channel, 1),
+            dtype=x.dtype,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        beta = flow.get_variable(
+            name + "_beta",
+            shape=(1, channel, 1),
+            dtype=x.dtype,
+            initializer=flow.zeros_initializer(),
+            trainable=True,
+        )
+        normalized = gamma * normalized + beta
+    reshape_back = flow.reshape_like(normalized, like=x)
+    return reshape_back
+
+
+def instance_normalization1d(
+    x: oneflow._oneflow_internal.BlobDesc,
+    eps: float = 1e-05,
+    affine: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Applies Instance Normalization over a 3D input.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): 3D input tensor with NCL data layout.
+        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
+        affine (bool): A boolean value that when set to True, this module has learnable affine parameters,
+                       initialized the same way as done for batch normalization. Default: True.
+        name (Optional[str], optional): Name of this op.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The normalized input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def instance_norm_Job(x: tp.Numpy.Placeholder((4, 2, 32))
+        ) -> tp.Numpy:
+            instance_norm = flow.nn.InstanceNorm1d(
+                x,
+                eps=1e-5,
+                affine=True,
+            )
+            return instance_norm
+
+        x = np.random.random(size=(4, 2, 32)).astype(np.float32)
+        out = instance_norm_Job(x)
+
+    """
+    assert len(x.shape) == 3
+    if name is None:
+        name = id_util.UniqueStr("InstanceNorm1D_")
+    channel = x.shape[1]
+    (mean, variance) = flow.nn.moments(x, [2], keepdims=True)
+    normalized = (x - mean) / flow.math.sqrt(variance + eps)
+    if affine == True:
+        gamma = flow.get_variable(
+            name + "_gamma",
+            shape=(1, channel, 1),
+            dtype=x.dtype,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        beta = flow.get_variable(
+            name + "_beta",
+            shape=(1, channel, 1),
+            dtype=x.dtype,
+            initializer=flow.zeros_initializer(),
+            trainable=True,
+        )
+        return gamma * normalized + beta
+    else:
+        return normalized
+
+
+def instance_normalization2d(
+    x: oneflow._oneflow_internal.BlobDesc,
+    eps: float = 1e-05,
+    affine: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Applies Instance Normalization over a 4D input.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): 4D input tensor with NCHW data layout.
+        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
+        affine (bool): A boolean value that when set to True, this module has learnable affine parameters,
+                       initialized the same way as done for batch normalization. Default: True.
+        name (Optional[str], optional): Name of this op.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The normalized input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def instance_norm_Job(x: tp.Numpy.Placeholder((4, 2, 32, 32))
+        ) -> tp.Numpy:
+            instance_norm = flow.nn.InstanceNorm2d(
+                x,
+                eps=1e-5,
+                affine=True,
+            )
+            return instance_norm
+
+        x = np.random.random(size=(4, 2, 32, 32)).astype(np.float32)
+        out = instance_norm_Job(x)
+
+    """
+    assert len(x.shape) == 4
+    if name is None:
+        name = id_util.UniqueStr("InstanceNorm2D_")
+    reshape_to_1d = flow.reshape(x, shape=[x.shape[0], x.shape[1], -1])
+    normalized_1d_out = flow.nn.InstanceNorm1d(
+        reshape_to_1d, eps=eps, affine=affine, name=name
+    )
+    reshape_back_to_2d = flow.reshape(normalized_1d_out, shape=list(x.shape))
+    return reshape_back_to_2d
+
+
+def instance_normalization3d(
+    x: oneflow._oneflow_internal.BlobDesc,
+    eps: float = 1e-05,
+    affine: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Applies Instance Normalization over a 5D input.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): 5D input tensor with NCDHW data layout.
+        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
+        affine (bool): A boolean value that when set to True, this module has learnable affine parameters,
+                       initialized the same way as done for batch normalization. Default: True.
+        name (Optional[str], optional): Name of this op.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The normalized input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function()
+        def instance_norm_Job(x: tp.Numpy.Placeholder((4, 2, 32, 32, 32))
+        ) -> tp.Numpy:
+            instance_norm = flow.nn.InstanceNorm2d(
+                x,
+                eps=1e-5,
+                affine=True,
+            )
+            return instance_norm
+
+        x = np.random.random(size=(4, 2, 32, 32, 32)).astype(np.float32)
+        out = instance_norm_Job(x)
+
+    """
+    assert len(x.shape) == 5
+    if name is None:
+        name = id_util.UniqueStr("InstanceNorm3D_")
+    reshape_to_1d = flow.reshape(x, shape=[x.shape[0], x.shape[1], -1])
+    normalized_1d_out = flow.nn.InstanceNorm1d(
+        reshape_to_1d, eps=eps, affine=affine, name=name
+    )
+    reshape_back_to_3d = flow.reshape(normalized_1d_out, shape=list(x.shape))
+    return reshape_back_to_3d
+
+
+def batch_normalization(
+    x: oneflow._oneflow_internal.BlobDesc,
+    mean: oneflow._oneflow_internal.BlobDesc,
+    variance: oneflow._oneflow_internal.BlobDesc,
+    offset: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    scale: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    variance_epsilon: Optional[float] = 1e-05,
+    axis: int = 1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This op does not fully align with tf.nn.batch_normalization.
+
+    The `mean`, `variable`, `offset` and `scale` are always 1D. Users need to specify `axis` to 1 for NCHW data format.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input `Blob` of arbitrary dimensionality.
+        mean (oneflow._oneflow_internal.BlobDesc): A 1D mean `Blob`.
+        variance (oneflow._oneflow_internal.BlobDesc):   A 1D variance `Blob`.
+        offset (Optional[oneflow._oneflow_internal.BlobDesc]): An 1D offset `Blob`, often denoted  in equations, or None. If present, will be added to the normalized `Blob`.
+        scale (Optional[oneflow._oneflow_internal.BlobDesc]): A 1D scale `Blob`, often denoted  in equations, or None. If present, the scale is applied to the normalized `Blob`.
+        variance_epsilon (float):   A small float number to avoid dividing by 0.
+        axis (int, optional): 1 for '`NCHW'` data format. Defaults to 1.
+        name (Optional[str], optional): This operator's name.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  the normalized, scaled, offset `Blob`.
+
+    Note:
+
+        This api is more flexible, if you're new to OneFlow, it's more recommend to use `oneflow.compatible.single_client.layers.batch_normalization`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def batch_norm_Job(x: tp.Numpy.Placeholder((1, 5))
+        ) -> tp.Numpy:
+            bn_mean, bn_variance = flow.nn.moments(x, axes=[1])
+            batch_norm = flow.nn.batch_normalization(
+                x,
+                mean=bn_mean,
+                variance=bn_variance,
+                axis=0
+            )
+            return batch_norm
+
+
+        x = np.array([[1, 2, 3, 4, 5]]).astype(np.float32)
+        out = batch_norm_Job(x)
+
+        # out [[-1.41421  -0.707105  0.        0.707105  1.41421 ]]
+
+    """
+    assert axis >= -len(x.shape) and axis < len(x.shape)
+    if axis < 0:
+        axis += len(x.shape)
+    if name is None:
+        name = id_util.UniqueStr("BatchNorm_")
+    params_shape = [x.shape[axis]]
+    if flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu":
+        if len(mean.shape) == 1:
+            nd_params_shape = [1] * len(x.shape)
+            nd_params_shape[axis] = params_shape[0]
+            mean = flow.reshape(mean, nd_params_shape)
+            variance = flow.reshape(variance, nd_params_shape)
+            if scale:
+                scale = flow.reshape(scale, nd_params_shape)
+            if offset:
+                offset = flow.reshape(offset, nd_params_shape)
+        elif len(mean.shape) == len(x.shape):
+            pass
+        else:
+            raise ValueError(
+                "shape of mean and variance should be 1D or has number of axes and x's"
+            )
+        variance += variance_epsilon
+        std_inv = flow.math.rsqrt(variance)
+        normalized = (x - mean) * std_inv
+        affined = normalized
+        if scale:
+            affined *= scale
+        if offset:
+            affined += offset
+        return affined
+    elif flow.current_scope().device_parallel_desc_symbol.device_tag == "gpu":
+        params_dtype = flow.float32 if x.dtype == flow.float16 else x.dtype
+        if scale is None:
+            scale = flow.constant(
+                1, dtype=params_dtype, shape=params_shape, name="gamma"
+            )
+        if offset is None:
+            offset = flow.constant(
+                0, dtype=params_dtype, shape=params_shape, name="beta"
+            )
+        builder = (
+            flow.user_op_builder(name)
+            .Op("normalization")
+            .Input("x", [x])
+            .Input("moving_mean", [mean])
+            .Input("moving_variance", [variance])
+            .Input("gamma", [scale])
+            .Input("beta", [offset])
+            .Output("y")
+            .Attr("axis", axis)
+            .Attr("epsilon", variance_epsilon)
+            .Attr("training", False)
+            .Attr("momentum", 0.0)
+        )
+        return builder.Build().InferAndTryRun().RemoteBlobList()[0]
+    else:
+        raise NotImplementedError
+
+
+def layer_norm(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    gamma: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    beta: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    begin_norm_axis: int = 1,
+    begin_params_axis: int = -1,
+    epsilon: float = 1e-05,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Layer Normalization.
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+        gamma (Optional[oneflow._oneflow_internal.BlobDesc]).
+        beta (Optional[oneflow._oneflow_internal.BlobDesc]).
+        begin_norm_axis (int, optional): An integer specifies which axis to normalize at first. Defaults to 1.
+        begin_params_axis (int, optional):  An integer specifies which axis params at . Defaults to -1.
+        epsilon (float, optional): A small float is added to avoid division by zero. Defaults to 1e-5.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A normalized `Blob` with same shape of input.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def layer_norm_Job(x: tp.Numpy.Placeholder((1, 64, 128, 128))
+        ) -> tp.Numpy:
+            layer_norm = flow.nn.layer_norm(
+                x,
+                name="LayerNorm1"
+            )
+            return layer_norm
+
+
+        x = np.random.randn(1, 64, 128, 128).astype(np.float32)
+        out = layer_norm_Job(x)
+
+        # out.shape (1, 64, 128, 128)
+
+    """
+    param_shape = inputs.shape[begin_params_axis:]
+    if name is None:
+        name = id_util.UniqueStr("LayerNorm_")
+    if flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu":
+        if begin_norm_axis < 0:
+            begin_norm_axis = begin_norm_axis + len(inputs.shape)
+        reduce_axis = []
+        for dim in range(len(inputs.shape)):
+            if dim >= begin_norm_axis:
+                reduce_axis.append(dim)
+        (mean, variance) = flow.nn.moments(inputs, reduce_axis, keepdims=True)
+        axis = begin_norm_axis
+        normalized = flow.nn.batch_normalization(
+            x=inputs,
+            mean=mean,
+            variance=variance,
+            variance_epsilon=epsilon,
+            axis=axis,
+            name=name,
+        )
+        nd_params_shape = [1] * (len(inputs.shape) - len(param_shape)) + list(
+            param_shape
+        )
+        affined = normalized
+        if gamma:
+            gamma = flow.reshape(gamma, nd_params_shape)
+            affined *= gamma
+        if beta:
+            beta = flow.reshape(beta, nd_params_shape)
+            affined += beta
+        return affined
+    elif flow.current_scope().device_parallel_desc_symbol.device_tag == "gpu":
+        op_builder = (
+            flow.user_op_builder(name)
+            .Op("layer_norm")
+            .Input("x", [inputs])
+            .Output("y")
+            .Output("mean")
+            .Output("inv_variance")
+        )
+        scale = False
+        center = False
+        if beta is not None:
+            center = True
+            op_builder.Input("beta", [beta])
+        if gamma is not None:
+            scale = True
+            op_builder.Input("gamma", [gamma])
+            op_builder.Output("normalized")
+        op_builder.Attr("center", center)
+        op_builder.Attr("scale", scale)
+        op_builder.Attr("begin_norm_axis", begin_norm_axis)
+        op_builder.Attr("begin_params_axis", begin_params_axis)
+        op_builder.Attr("epsilon", epsilon)
+        y = op_builder.Build().InferAndTryRun().RemoteBlobList()[0]
+        return y
+    else:
+        raise NotImplementedError
+
+
+def tf_conv2d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    filters: oneflow._oneflow_internal.BlobDesc,
+    strides: Union[int, Sequence[int]],
+    padding: str,
+    data_format: str = "NCHW",
+    dilations: Optional[Union[int, Sequence[int]]] = None,
+    groups: int = 1,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes a 2-D convolution given `input` and 4-D `filters` `Blob`.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A `Blob` of rank at least 4.
+        filters (oneflow._oneflow_internal.BlobDesc): A `Blob` with the same type as `input` and has the shape `[out_channels, in_channels//groups, filter_height, filter_width] for NCHW, or [out_channels, filter_height, filter_width, in_channels//groups] for NHWC`
+        strides (Union[int, Sequence[int]]): An int or list of `ints` that has length `1`, or `2`. The stride of the sliding window for each dimension of `input`.
+        padding (str): `"SAME"` or `"VALID"` indicating the type of padding algorithm to use, or a list indicating the explicit paddings at the start and end of each dimension.
+        data_format (str, optional): `"NHWC"` or `"NCHW"`. Defaults to `"NCHW"`.
+        dilations (Optional[Union[int, Sequence[int]]], optional): The dilation factor for each dimension of`input`. Defaults to None.
+        groups (int, optional): int value greater than 0. Defaults to 1.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Raises:
+        ValueError: strides must be an int or a list.
+        ValueError: data_format must be "NHWC" or "NCHW".
+        ValueError: dilations length must be 2 when passed as a list.
+        ValueError: dilations must be an int or a list.
+        ValueError: data_format NHWC not support groups > 1.
+        ValueError: invalid data_format.
+        ValueError: padding must be "SAME" or "VALID".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` with the same type as `input` and the same outer batch shape.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def conv2d(input, filters, kernel_size, strides, padding, name):
+            input_shape = input.shape
+            weight_initializer = flow.truncated_normal(0.1)
+            weight_regularizer = flow.regularizers.l2(0.0005)
+            weight_shape = (filters,
+                            input_shape[1],
+                            kernel_size[0],
+                            kernel_size[1])
+
+            weight = flow.get_variable(
+                name + "-weight",
+                shape=weight_shape,
+                initializer=weight_initializer,
+                regularizer=weight_regularizer,
+            )
+            return flow.nn.compat_conv2d(input, weight, strides, padding, name=name)
+
+
+        @flow.global_function()
+        def conv2d_Job(x: tp.Numpy.Placeholder((1, 64, 32, 32))
+        ) -> tp.Numpy:
+            conv = conv2d(x,
+                        filters=128,
+                        kernel_size=[3, 3],
+                        strides=2,
+                        padding='SAME',
+                        name="Convlayer")
+            return conv
+
+
+        x = np.random.randn(1, 64, 32, 32).astype(np.float32)
+        out = conv2d_Job(x)
+
+        # out.shape (1, 128, 16, 16)
+
+    """
+    if padding.upper() == "SAME":
+        padding = "SAME_UPPER"
+    return flow.nn.conv2d(
+        input, filters, strides, padding, None, data_format, dilations, groups, name
+    )
+
+
+def bias_add(
+    value: oneflow._oneflow_internal.BlobDesc,
+    bias: oneflow._oneflow_internal.BlobDesc,
+    data_format: Optional[str] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator adds a bias to Blob.
+
+    Args:
+        value (oneflow._oneflow_internal.BlobDesc):  A `Blob`.
+        bias (oneflow._oneflow_internal.BlobDesc): A 1-D `Blob` with size matching the channel dimension of value. And has the same type as value unless value is a quantized type.
+        data_format (Optional[str], optional): A string. '`N...C'` or '`NC...'`. Defaults to None.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Raises:
+        ValueError: ValueError if data format is unrecognized, if value has less than two dimensions with '`N..C'`/None data_format or value has less than three dimensions with '`NC..'` data_format, if bias is a vector, or if the size of bias does not match the size of the channel dimension of value.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as value.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def bias_add_Job(x: tp.Numpy.Placeholder((1, 64, 128, 128))
+        ) -> tp.Numpy:
+            bias_initializer = flow.truncated_normal(0.1)
+            bias_regularizer = flow.regularizers.l2(0.0005)
+            bias = flow.get_variable(
+                    "Add_bias",
+                    shape=(64,),
+                    initializer=bias_initializer,
+                    regularizer=bias_regularizer,
+                )
+            bias_out = flow.nn.bias_add(x, bias)
+            return bias_out
+
+
+        x = np.random.randn(1, 64, 128, 128).astype(np.float32)
+        out = bias_add_Job(x)
+
+        # out.shape (1, 64, 128, 128)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("BiasAdd_")
+    if data_format is None:
+        bias_add_axis = 1
+    elif data_format.startswith("NC"):
+        bias_add_axis = 1
+    elif data_format.startswith("N") and data_format.endswith("C"):
+        bias_add_axis = len(value.shape) - 1
+    else:
+        raise ValueError("data_format must be of the form `N...C` or `NC...`")
+    return (
+        flow.user_op_builder(name)
+        .Op("bias_add")
+        .Input("a", [value])
+        .Input("b", [bias])
+        .Output("out")
+        .Attr("axis", bias_add_axis)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def fused_bias_add_gelu(
+    value: oneflow._oneflow_internal.BlobDesc,
+    bias: oneflow._oneflow_internal.BlobDesc,
+    data_format: Optional[str] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator fuse flow.nn.bias_add and flow.math.gelu operator.
+
+    Args:
+        value (oneflow._oneflow_internal.BlobDesc):  A `Blob`.
+        bias (oneflow._oneflow_internal.BlobDesc): A 1-D `Blob` with size matching the channel dimension of value. And has the same type as value unless value is a quantized type.
+        data_format (Optional[str], optional): A string. '`N...C'` or '`NC...'`. Defaults to None.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Raises:
+        ValueError: ValueError if data format is unrecognized, if value has less than two dimensions with '`N..C'`/None data_format or value has less than three dimensions with '`NC..'` data_format, if bias is a vector, or if the size of bias does not match the size of the channel dimension of value.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as value.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def fused_bias_add_gelu_Job(x: tp.Numpy.Placeholder((1, 64, 128, 128))
+        ) -> tp.Numpy:
+            bias_initializer = flow.truncated_normal(0.1)
+            bias_regularizer = flow.regularizers.l2(0.0005)
+            bias = flow.get_variable(
+                    "Add_bias",
+                    shape=(64,),
+                    initializer=bias_initializer,
+                    regularizer=bias_regularizer,
+                )
+            out = flow.nn.fused_bias_add_gelu(x, bias)
+            return out
+
+
+        x = np.random.randn(1, 64, 128, 128).astype(np.float32)
+        out = fused_bias_add_gelu_Job(x)
+
+        # out.shape (1, 64, 128, 128)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("FusedBiasAddGelu_")
+    if data_format is None:
+        bias_add_axis = 1
+    elif data_format.startswith("NC"):
+        bias_add_axis = 1
+    elif data_format.startswith("N") and data_format.endswith("C"):
+        bias_add_axis = len(value.shape) - 1
+    else:
+        raise ValueError("data_format must be of the form `N...C` or `NC...`")
+    return (
+        flow.user_op_builder(name)
+        .Op("fused_bias_add_gelu")
+        .Input("a", [value])
+        .Input("b", [bias])
+        .Output("out")
+        .Attr("axis", bias_add_axis)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def fused_bias_add_dropout(
+    value: oneflow._oneflow_internal.BlobDesc,
+    bias: oneflow._oneflow_internal.BlobDesc,
+    data_format: Optional[str] = None,
+    rate: float = 0.0,
+    noise_shape: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    seed: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator fuse flow.nn.bias_add and flow.nn.dropout operator.
+
+    Args:
+        value (oneflow._oneflow_internal.BlobDesc):  A `Blob`.
+        bias (oneflow._oneflow_internal.BlobDesc): A 1-D `Blob` with size matching the channel dimension of value. And has the same type as value unless value is a quantized type.
+        data_format (Optional[str], optional): A string. '`N...C'` or '`NC...'`. Defaults to None.
+        rate (float): A scalar `Blob` with the same type as x. The probability that each element is dropped.
+        noise_shape (Optional[oneflow._oneflow_internal.BlobDesc], optional):  optional: A 1-D `Blob`, representing the shape for randomly generated keep/drop flags. Defaults to None.Defaults to None.
+        seed (Optional[int], optional):  Optional int value. Defaults to None.
+        name (Optional[str], optional): This operator's name. Defaults to None.
+
+    Raises:
+        ValueError: ValueError if data format is unrecognized, if value has less than two dimensions with '`N..C'`/None data_format or value has less than three dimensions with '`NC..'` data_format, if bias is a vector, or if the size of bias does not match the size of the channel dimension of value.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as value.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def fused_bias_add_dropout_Job(x: tp.Numpy.Placeholder((1, 64, 128, 128))
+        ) -> tp.Numpy:
+            bias_initializer = flow.truncated_normal(0.1)
+            bias_regularizer = flow.regularizers.l2(0.0005)
+            bias = flow.get_variable(
+                    "Add_bias",
+                    shape=(64,),
+                    initializer=bias_initializer,
+                    regularizer=bias_regularizer,
+                )
+            out = flow.nn.fused_bias_add_dropout(x, bias)
+            return out
+
+
+        x = np.random.randn(1, 64, 128, 128).astype(np.float32)
+        out = fused_bias_add_dropout_Job(x)
+
+        # out.shape (1, 64, 128, 128)
+
+    """
+    assert rate is not None and rate >= 0.0 and (rate < 1.0)
+    if not flow.current_global_function_desc().IsTrainable() or rate == 0.0:
+        return flow.nn.bias_add(value, bias, data_format, name)
+    if name is None:
+        name = id_util.UniqueStr("BiasAddDropout_")
+    mask = flow.nn.random_mask_like(
+        value, rate, seed, noise_shape, "%s-dropout_random_mask_like" % name
+    )
+    if data_format is None:
+        bias_add_axis = 1
+    elif data_format.startswith("NC"):
+        bias_add_axis = 1
+    elif data_format.startswith("N") and data_format.endswith("C"):
+        bias_add_axis = len(value.shape) - 1
+    else:
+        raise ValueError("data_format must be of the form `N...C` or `NC...`")
+    return (
+        flow.user_op_builder(name)
+        .Op("fused_bias_add_mask_scale")
+        .Input("a", [value])
+        .Input("b", [bias])
+        .Input("mask", [mask])
+        .Output("out")
+        .Attr("axis", bias_add_axis)
+        .Attr("scale", float(1.0 / (1.0 - rate)))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def max_pool1d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    ksize: Union[int, Sequence[int]],
+    strides: Union[int, Sequence[int]],
+    padding: Union[str, Sequence[Sequence[int]]],
+    data_format: str = "NWC",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Performs the 1d-max pooling on the input.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A 3-D `Blob` of the format specified by data_format.
+        ksize (Union[int, Sequence[int]]): An int or list of ints that has length 1 or 3. The size of the window for each dimension of the input `Blob`.
+        strides (Union[int, Sequence[int]]): An int or list of ints that has length 1 or 3. The stride of the sliding window for each dimension of the input `Blob`.
+        padding (str):  '`VALID'` or '`SAME'`. The padding algorithm.
+        data_format (str, optional):  An optional string from: '`NWC'`, '`NCW'`. Defaults to '`NWC'`.
+        name (Optional[str], optional): This operator's name(optional).Defaults to None.
+
+    Raises:
+        NotImplementedError: TODO: fix cuDNN bugs in pooling_1d
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` of format specified by data_format. The max pooled output `Blob`.
+    """
+    raise NotImplementedError
+
+
+def avg_pool1d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    ksize: Union[int, Sequence[int]],
+    strides: Union[int, Sequence[int]],
+    padding: Union[str, Sequence[Sequence[int]]],
+    data_format: str = "NCW",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Performs the average pooling on the input `Blob`.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A 3-D `Blob` of the format specified by data_format.
+        ksize (Union[int, Sequence[int]]): An int or list of ints that has length 1 or 3. The size of the window for each dimension of the input `Blob`.
+        strides (Union[int, Sequence[int]]): An int or list of ints that has length 1 or 3. The stride of the sliding window for each dimension of the input `Blob`.
+        padding (str): '`VALID'` or '`SAME'`.
+        data_format (str, optional):  '`NWC'` or '`NCW'`. Defaults to '`NWC'`.
+        name (Optional[str], optional):  This operator's name(optional). Defaults to None.
+
+    Raises:
+        NotImplementedError: TODO: fix cuDNN bugs in pooling_1d
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` of format specified by data_format. The max pooled output `Blob`.
+    """
+    raise NotImplementedError
+
+
+def calc_pool_padding(padding, dhw_offset, ndims):
+    if isinstance(padding, str):
+        padding = "SAME_LOWER" if padding.upper() == "SAME" else padding
+        assert padding.upper() in ["VALID", "SAME_LOWER", "SAME_UPPER"]
+        padding_type = padding.lower()
+        ndim_pads_list = [[0, 0]] * ndims
+    elif isinstance(padding, (list, tuple)):
+        padding_type = "customized"
+        ndim_pads_list = get_ndim_pads_list(padding, dhw_offset, ndims)
+    else:
+        raise ValueError("padding must be str or a list.")
+    return (padding_type, ndim_pads_list)
+
+
+def max_pool2d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    ksize: Union[int, IntPair],
+    strides: Union[int, IntPair],
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair, IntPair]],
+    data_format: str = "NCHW",
+    ceil_mode: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Performs the 2d-max pooling on the input `Blob`.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A 4-D `Blob` of the format specified by data_format.
+        ksize (Union[int, IntPair]): An int or list of ints that has length 1, 2. The size of the window for each dimension of the input `Blob`.
+        strides (Union[int, IntPair]): An int or list of ints that has length 1, 2. The stride of the sliding window for each dimension of the input `Blob`.
+        padding (str): '`VALID'` or '`SAME'` or '`SAME_LOWER'` or '`SAME_UPPER'` or Tuple[IntPair, IntPair, IntPair, IntPair]`. The padding algorithm.
+        data_format (str, optional): '`NHWC'`, '`NCHW'` or '`NCHW_VECT_C'`. Defaults to "NCHW".
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` of format specified by data_format. The max pooled output `Blob`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def maxpool2d_Job(x: tp.Numpy.Placeholder((1, 32, 128, 128))
+        ) -> tp.Numpy:
+            pool_out = flow.nn.max_pool2d(
+                input=x,
+                ksize=3,
+                strides=2,
+                padding='SAME',
+                data_format='NCHW'
+            )
+
+            return pool_out
+
+
+        x = np.random.randn(1, 32, 128, 128).astype(np.float32)
+        out = maxpool2d_Job(x)
+
+        # out.shape (1, 32, 64, 64)
+
+    """
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("MaxPool2D_")
+        )
+        .Op("max_pool_2d")
+        .Input("x", [input])
+        .Output("y")
+    )
+    assert data_format in ["NHWC", "NCHW", "NCHW_VECT_C"]
+    channel_pos = "channels_last" if data_format == "NHWC" else "channels_first"
+    op.Attr("data_format", channel_pos)
+    pool_size = _GetSequence(ksize, 2, "ksize")
+    op.Attr("pool_size", pool_size)
+    strides = _GetSequence(strides, 2, "strides")
+    op.Attr("strides", strides)
+    (padding_type, pads_list) = calc_pool_padding(
+        padding, get_dhw_offset(channel_pos), 2
+    )
+    assert len(pads_list) == len(input.shape) - 2
+    padding_before = [pad[0] for pad in pads_list]
+    padding_after = [pad[1] for pad in pads_list]
+    op.Attr("padding", padding_type)
+    op.Attr("padding_before", padding_before)
+    op.Attr("padding_after", padding_after)
+    op.Attr("ceil_mode", ceil_mode)
+    return op.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def avg_pool2d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    ksize: Union[int, IntPair],
+    strides: Union[int, IntPair],
+    padding: Union[str, Tuple[IntPair, IntPair, IntPair, IntPair]],
+    data_format: str = "NCHW",
+    ceil_mode: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Performs the 2d-average pooling on the input.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A 4-D `Blob` of shape [batch, height, width, channels].
+        ksize (Union[int, IntPair]):  An int or list of ints that has length 1, 2. The size of the window for each dimension of the input `Blob`.
+        strides (Union[int, IntPair]): An int or list of ints that has length 1, 2. The stride of the sliding window for each dimension of the input `Blob`.
+        padding (str): '`VALID'` or '`SAME'` or '`SAME_LOWER'` or '`SAME_UPPER'` or Tuple[IntPair, IntPair, IntPair, IntPair]. The padding algorithm.
+        data_format (str, optional): '`NHWC'` or '`NCHW'`. Defaults to "NCHW".
+        name (Optional[str], optional):  This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` with the same type as '`value'`. The average pooled output `Blob`.
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def avgpool2d_Job(x: tp.Numpy.Placeholder((1, 32, 128, 128))
+        ) -> tp.Numpy:
+            pool_out = flow.nn.avg_pool2d(
+                input=x,
+                ksize=3,
+                strides=2,
+                padding='SAME',
+                data_format='NCHW'
+            )
+
+            return pool_out
+
+
+        x = np.random.randn(1, 32, 128, 128).astype(np.float32)
+        out = avgpool2d_Job(x)
+
+        # out.shape (1, 32, 64, 64)
+
+    """
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("AvgPool2D_")
+        )
+        .Op("avg_pool_2d")
+        .Input("x", [input])
+        .Output("y")
+    )
+    assert data_format in ["NHWC", "NCHW", "NCHW_VECT_C"]
+    channel_pos = "channels_last" if data_format == "NHWC" else "channels_first"
+    op.Attr("data_format", channel_pos)
+    pool_size = _GetSequence(ksize, 2, "ksize")
+    op.Attr("pool_size", pool_size)
+    strides = _GetSequence(strides, 2, "strides")
+    op.Attr("strides", strides)
+    (padding_type, pads_list) = calc_pool_padding(
+        padding, get_dhw_offset(channel_pos), 2
+    )
+    assert len(pads_list) == len(input.shape) - 2
+    padding_before = [pad[0] for pad in pads_list]
+    padding_after = [pad[1] for pad in pads_list]
+    op.Attr("padding", padding_type)
+    op.Attr("padding_before", padding_before)
+    op.Attr("padding_after", padding_after)
+    op.Attr("ceil_mode", ceil_mode)
+    return op.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def max_pool3d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    ksize: Union[int, Sequence[int]],
+    strides: Union[int, Sequence[int]],
+    padding: Union[str, Sequence[Sequence[int]]],
+    data_format: str = "NCDHW",
+    ceil_mode: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Performs the 3d-max pooling on the input.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc):  A 5-D `Blob` of the format specified by data_format.
+        ksize (Union[int, Sequence[int]]):  An int or list of ints that has length 1, 3 or 5. The size of the window for each dimension of the input `Blob`.
+        strides (Union[int, Sequence[int]]): An int or list of ints that has length 1, 3 or 5. The stride of the sliding window for each dimension of the input `Blob`.
+        padding (str): '`VALID'` or '`SAME'` or '`SAME_LOWER'` or '`SAME_UPPER'` or '`Sequence[Sequence[int]]'`.
+        data_format (str, optional):   "NDHWC" or "NCDHW". Defaults to "NCDHW".
+        name (Optional[str], optional): This operator's name(optional).
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` of format specified by data_format. The max pooled output `Blob`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def maxpool3d_Job(x: tp.Numpy.Placeholder((1, 32, 10, 128, 128))
+        ) -> tp.Numpy:
+            pool_out = flow.nn.max_pool3d(
+                input=x,
+                ksize=3,
+                strides=2,
+                padding='SAME',
+                data_format='NCDHW'
+            )
+
+            return pool_out
+
+
+        x = np.random.randn(1, 32, 10, 128, 128).astype(np.float32)
+        out = maxpool3d_Job(x)
+
+        # out.shape (1, 32, 5, 64, 64)
+
+    """
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("MaxPool3D_")
+        )
+        .Op("max_pool_3d")
+        .Input("x", [input])
+        .Output("y")
+    )
+    assert data_format in ["NDHWC", "NCDHW"]
+    channel_pos = "channels_last" if data_format == "NDHWC" else "channels_first"
+    op.Attr("data_format", channel_pos)
+    pool_size = _GetSequence(ksize, 3, "ksize")
+    op.Attr("pool_size", pool_size)
+    strides = _GetSequence(strides, 3, "strides")
+    op.Attr("strides", strides)
+    (padding_type, pads_list) = calc_pool_padding(
+        padding, get_dhw_offset(channel_pos), 3
+    )
+    assert len(pads_list) == len(input.shape) - 2
+    padding_before = [pad[0] for pad in pads_list]
+    padding_after = [pad[1] for pad in pads_list]
+    op.Attr("padding", padding_type)
+    op.Attr("padding_before", padding_before)
+    op.Attr("padding_after", padding_after)
+    op.Attr("ceil_mode", ceil_mode)
+    return op.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def avg_pool3d(
+    input: oneflow._oneflow_internal.BlobDesc,
+    ksize: Union[int, Sequence[int]],
+    strides: Union[int, Sequence[int]],
+    padding: Union[str, Sequence[Sequence[int]]],
+    data_format: str = "NCDHW",
+    ceil_mode: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Performs the 3d-average pooling on the input.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A 5-D `Blob` of shape [batch, height, width, channels].
+        ksize (Union[int, Sequence[int]]): An int or list of ints that has length 1, 3 or 5. The size of the window for each dimension of the input `Blob`.
+        strides (Union[int, Sequence[int]]): An int or list of ints that has length 1, 3 or 5. The stride of the sliding window for each dimension of the input `Blob`.
+        padding (str): '`VALID'` or '`SAME'` or '`SAME_LOWER'` or '`SAME_UPPER or Sequence[Sequence[int]]'`.
+        data_format (str, optional):  '`NDHWC'` or '`NCDHW'`. Defaults to "NCDHW".
+        name (Optional[str], optional):  This operator's name(optional).Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as value. The average pooled output `Blob`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def avgpool3d_Job(x: tp.Numpy.Placeholder((1, 32, 10, 128, 128))
+        ) -> tp.Numpy:
+            pool_out = flow.nn.avg_pool3d(
+                input=x,
+                ksize=3,
+                strides=2,
+                padding='SAME',
+                data_format='NCDHW'
+            )
+
+            return pool_out
+
+
+        x = np.random.randn(1, 32, 10, 128, 128).astype(np.float32)
+        out = avgpool3d_Job(x)
+
+        # out.shape (1, 32, 5, 64, 64)
+
+    """
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("AvgPool3D_")
+        )
+        .Op("avg_pool_3d")
+        .Input("x", [input])
+        .Output("y")
+    )
+    assert data_format in ["NDHWC", "NCDHW"]
+    channel_pos = "channels_last" if data_format == "NDHWC" else "channels_first"
+    op.Attr("data_format", channel_pos)
+    pool_size = _GetSequence(ksize, 3, "ksize")
+    op.Attr("pool_size", pool_size)
+    strides = _GetSequence(strides, 3, "strides")
+    op.Attr("strides", strides)
+    (padding_type, pads_list) = calc_pool_padding(
+        padding, get_dhw_offset(channel_pos), 3
+    )
+    assert len(pads_list) == len(input.shape) - 2
+    padding_before = [pad[0] for pad in pads_list]
+    padding_after = [pad[1] for pad in pads_list]
+    op.Attr("padding", padding_type)
+    op.Attr("padding_before", padding_before)
+    op.Attr("padding_after", padding_after)
+    op.Attr("ceil_mode", ceil_mode)
+    return op.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def _softmax_need_transpose(x, axis):
+    assert type(axis) is int
+    dim_num = len(x.shape)
+    assert dim_num >= 2
+    if axis < 0:
+        axis += dim_num
+    assert axis >= 0
+    assert axis < dim_num
+    need_transpose = False
+    permute = list(range(dim_num))
+    if axis != dim_num - 1:
+        need_transpose = True
+        permute[axis] = permute[-1]
+        permute[-1] = axis
+    return (need_transpose, permute)
+
+
+def softmax(
+    logits: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes softmax activations.
+
+    For each element, we apply:
+
+    .. math::
+        S_i = \\frac{e^i}{\\sum_1^j e^j }
+
+    Args:
+        logits (oneflow._oneflow_internal.BlobDesc): A non-empty `Blob`.
+        axis (Optional[int], optional): The dimension softmax would be performed on. Defaults to None.
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` has the same type and shape as logits.
+
+    Raises:
+        InvalidArgumentError: if logits is empty or axis is beyond the last dimension of logits.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def softmax_Job(x: tp.Numpy.Placeholder((1, 5))
+        ) -> tp.Numpy:
+            softmax_out = flow.nn.softmax(x, axis=1)
+
+            return softmax_out
+
+
+        x = np.array([[1, 2, 1, 5, 4]]).astype(np.float32)
+        out = softmax_Job(x)
+
+        # out [[0.01259415 0.03423444 0.01259415 0.68761706 0.2529602 ]]
+
+    """
+    if axis is None:
+        axis = -1
+    (need_transpose, permute) = _softmax_need_transpose(logits, axis)
+    if need_transpose:
+        logits = flow.transpose(logits, perm=permute)
+    out = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Softmax_")
+        )
+        .Op("softmax")
+        .Input("in", [logits])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+    if need_transpose:
+        out = flow.transpose(out, perm=permute)
+    return out
+
+
+def logsoftmax(
+    logits: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes logsoftmax activations.
+
+    For each element, we apply:
+
+    .. math::
+
+        LogSoftmax(x_i) = Log(\\frac{e^i}{\\sum_1^j e^j })
+
+    Args:
+        logits (oneflow._oneflow_internal.BlobDesc): A non-empty `Blob`.
+        axis (Optional[int], optional): The dimension logsoftmax would be performed on. Defaults to None.
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` has the same type and shape as logits.
+
+    Raises:
+        InvalidArgumentError: if logits is empty or axis is beyond the last dimension of logits.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def logsoftmax_Job(x: tp.Numpy.Placeholder((1, 5))
+        ) -> tp.Numpy:
+            logsoftmax_out = flow.nn.logsoftmax(x, axis=1)
+            return logsoftmax_out
+
+
+        x = np.array([[1, 2, 1, 5, 4]]).astype(np.float32)
+        out = logsoftmax_Job(x)
+
+        # out [[-4.374523  -3.3745232 -4.374523  -0.3745232 -1.374523 ]]
+    """
+    if axis is None:
+        axis = -1
+    if name is None:
+        name = id_util.UniqueStr("logsoftmax")
+    return flow.math.log(
+        flow.nn.softmax(logits, axis, name=name + "_softmax"), name=name + "_log"
+    )
+
+
+def softmax_grad(
+    y: oneflow._oneflow_internal.BlobDesc,
+    dy: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes gradient of softmax activations.
+
+    Args:
+        y (oneflow._oneflow_internal.BlobDesc):  A `Blob` representing the softmax of x.
+        dy (oneflow._oneflow_internal.BlobDesc):  gradient of y.
+        axis (Optional[int], optional):  The dimension softmax would be performed on. Defaults to None.
+        name (Optional[str], optional):  This operator's name(optional).
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` representing the gradient of x.
+    """
+    if axis is None:
+        axis = -1
+    (need_transpose, permute) = _softmax_need_transpose(y, axis)
+    if need_transpose:
+        y = flow.transpose(y, perm=permute)
+        dy = flow.transpose(dy, perm=permute)
+    dx = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Softmax_")
+        )
+        .Op("softmax_grad")
+        .Input("y", [y])
+        .Input("dy", [dy])
+        .Output("dx")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+    if need_transpose:
+        dx = flow.transpose(dx, perm=permute)
+    return dx
+
+
+def sparse_cross_entropy(
+    labels: oneflow._oneflow_internal.BlobDesc,
+    prediction: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes sparse cross entropy
+
+    Args:
+        labels (oneflow._oneflow_internal.BlobDesc): A `Blob` of shape [d_0, d_1, ..., d_{r-1}] (where r is rank of labels and result). Each entry in labels must be an index in [0, num_classes).
+        prediction (oneflow._oneflow_internal.BlobDesc): A `Blob` with the rank that is equal to the rank of the labels plus one.
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` of the same shape as labels.
+
+    Note:
+
+        The labels data type should be `oneflow.compatible.single_client.int32`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sparse_cross_entropy_Job(input: tp.Numpy.Placeholder((5, 2), dtype=flow.float32),
+                                    labels: tp.Numpy.Placeholder((5,), dtype=flow.int32)
+        ) -> tp.Numpy:
+            loss = flow.nn.sparse_cross_entropy(labels=labels,
+                                                prediction=input)
+            return loss
+
+
+        x = np.array([[0.3, 0.7],
+                    [0.4, 0.6],
+                    [0.5, 0.5],
+                    [0.1, 0.9],
+                    [0.2, 0.8]]).astype(np.float32)
+        labels = np.array([0, 1, 1, 0, 1]).astype(np.int32)
+        loss = sparse_cross_entropy_Job(x, labels)
+
+        # out [1.2039728  0.5108256  0.6931472  2.3025851  0.22314353]
+
+    """
+    assert labels is not None
+    assert prediction is not None
+    if len(labels.shape) == len(prediction.shape):
+        assert labels.shape[-1] == 1
+        labels = flow.squeeze(labels, axis=[-1])
+    else:
+        assert len(labels.shape) == len(prediction.shape) - 1
+    if prediction.distribute is oneflow._oneflow_internal.distribute.split(
+        len(prediction.shape) - 1
+    ):
+        return (
+            flow.user_op_builder(
+                name if name is not None else id_util.UniqueStr("SparseCrossEntropyMs_")
+            )
+            .Op("sparse_cross_entropy_ms")
+            .Input("prediction", [prediction])
+            .Input("label", [labels])
+            .Output("out")
+            .Attr("depth", int(prediction.shape[-1]))
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        return (
+            flow.user_op_builder(
+                name if name is not None else id_util.UniqueStr("SparseCrossEntropy_")
+            )
+            .Op("sparse_cross_entropy")
+            .Input("prediction", [prediction])
+            .Input("label", [labels])
+            .Output("out")
+            .Attr("depth", int(prediction.shape[-1]))
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+
+
+def softmax_cross_entropy_with_logits(
+    labels: oneflow._oneflow_internal.BlobDesc,
+    logits: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes softmax cross entropy between logits and labels.
+
+    Args:
+        labels (oneflow._oneflow_internal.BlobDesc): Each vector along the class dimension should hold a valid probability distribution.
+        logits (oneflow._oneflow_internal.BlobDesc): Per-label activations, typically a linear output. logits has same shape and dtype as labels.
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` that contains the softmax cross entropy loss. Its type is the same as logits and its shape is the same as labels except that it does not have the last dimension of labels.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def softmax_cross_entropy_Job(input: tp.Numpy.Placeholder((3, 3), dtype=flow.float32),
+                                    labels: tp.Numpy.Placeholder((3, 3), dtype=flow.float32)
+        ) -> tp.Numpy:
+            loss = flow.nn.softmax_cross_entropy_with_logits(labels=labels,
+                                                            logits=input)
+            return loss
+
+
+        x = np.array([[4, 1, 2],
+                    [3, 2, 3],
+                    [1, 5, 10]]).astype(np.float32)
+        labels = np.array([[0.9, 0.05, 0.05],
+                        [0.3, 0.4, 0.3],
+                        [0.8, 0.1, 0.1]]).astype(np.float32)
+        loss = softmax_cross_entropy_Job(x, labels)
+
+        # out [0.73441553 1.1240788  1.4488925 ]
+
+    """
+    assert labels is not None
+    assert logits is not None
+    assert labels.shape == logits.shape
+    assert labels.dtype == logits.dtype
+    (prob, out) = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SoftmaxCrossEntropy_")
+        )
+        .Op("softmax_cross_entropy")
+        .Input("prediction", [logits])
+        .Input("label", [labels])
+        .Output("prob")
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return out
+
+
+def sparse_softmax_cross_entropy_with_logits(
+    labels: oneflow._oneflow_internal.BlobDesc,
+    logits: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes sparse softmax cross entropy between logits and labels.
+
+    Args:
+        labels (oneflow._oneflow_internal.BlobDesc): `Blob` of shape [d_0, d_1, ..., d_{r-1}] (where r is rank of labels and result). Each entry in labels must be an index in [0, num_classes).
+        logits (oneflow._oneflow_internal.BlobDesc): Unscaled log probabilities of shape [d_0, d_1, ..., d_{r-1},num_classes].
+        name (Optional[str], optional):  This operator's name(optional). Defaults to None.
+
+    Raises:
+        ValueError: If logits are scalars (need to have rank >= 1) or if the rank of the labels is not equal to the rank of the logits minus one.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:  A `Blob` of the same shape as labels and of the same type as logits with the softmax cross entropy loss.
+
+    Note:
+
+        The labels data type should be `oneflow.compatible.single_client.int32`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sparse_softmax_cross_entropy_Job(input: tp.Numpy.Placeholder((3, 3), dtype=flow.float32),
+                                             labels: tp.Numpy.Placeholder((3, ), dtype=flow.int32)
+        ) -> tp.Numpy:
+            loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
+                                                                    logits=input)
+            return loss
+
+
+        x = np.array([[4, 1, 2],
+                    [3, 2, 3],
+                    [1, 5, 10]]).astype(np.float32)
+        labels = np.array([0, 1, 2]).astype(np.int32)
+        loss = sparse_softmax_cross_entropy_Job(x, labels)
+
+        # out [0.65784633 1.2842525  0.5557927 ]
+
+    """
+    assert labels is not None
+    assert logits is not None
+    if len(labels.shape) == len(logits.shape):
+        assert labels.shape[-1] == 1
+        labels = flow.squeeze(labels, axis=[-1])
+    else:
+        assert len(labels.shape) == len(logits.shape) - 1
+    if logits.distribute is oneflow._oneflow_internal.distribute.split(
+        len(logits.shape) - 1
+    ):
+        (prob, out) = (
+            flow.user_op_builder(
+                name
+                if name is not None
+                else id_util.UniqueStr("SparseSoftmaxCrossEntropyMs_")
+            )
+            .Op("sparse_softmax_cross_entropy_ms")
+            .Input("prediction", [logits])
+            .Input("label", [labels])
+            .Output("prob")
+            .Output("out")
+            .Attr("depth", int(logits.shape[-1]))
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()
+        )
+    else:
+        (prob, out) = (
+            flow.user_op_builder(
+                name
+                if name is not None
+                else id_util.UniqueStr("SparseSoftmaxCrossEntropy_")
+            )
+            .Op("sparse_softmax_cross_entropy")
+            .Input("prediction", [logits])
+            .Input("label", [labels])
+            .Output("prob")
+            .Output("out")
+            .Attr("depth", int(logits.shape[-1]))
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()
+        )
+    return out
+
+
+def distributed_sparse_softmax_cross_entropy_with_logits(
+    labels: oneflow._oneflow_internal.BlobDesc,
+    logits: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    assert labels is not None
+    assert logits is not None
+    if len(labels.shape) == len(logits.shape):
+        assert labels.shape[-1] == 1
+        labels = flow.squeeze(labels, axis=[-1])
+    else:
+        assert len(labels.shape) == len(logits.shape) - 1
+    (prob, out) = (
+        flow.user_op_builder(
+            name
+            if name is not None
+            else id_util.UniqueStr("DistributedSparseSoftmaxCrossEntropy_")
+        )
+        .Op("sparse_softmax_cross_entropy_ms")
+        .Input("prediction", [logits])
+        .Input("label", [labels])
+        .Output("prob")
+        .Output("out")
+        .Attr("depth", int(logits.shape[-1]))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return out
+
+
+def sigmoid_cross_entropy_with_logits(
+    labels: oneflow._oneflow_internal.BlobDesc,
+    logits: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes sigmoid cross entropy given logits.
+
+    Args:
+        labels (oneflow._oneflow_internal.BlobDesc): A `Blob` of the same type and shape as logits.
+        logits (oneflow._oneflow_internal.BlobDesc): A `Blob` of type float.
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:   A `Blob` of the same shape as logits with the componentwise logistic losses.
+
+    Raises:
+        ValueError: If logits and labels do not have the same shape.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sigmoid_cross_entropy_Job(input: tp.Numpy.Placeholder((3, 2), dtype=flow.float32),
+                                    labels: tp.Numpy.Placeholder((3, 2), dtype=flow.float32)
+        ) -> tp.Numpy:
+            loss = flow.nn.sigmoid_cross_entropy_with_logits(labels=labels,
+                                                            logits=input)
+            return loss
+
+
+        x = np.array([[4, 1],
+                    [3, 2],
+                    [1, 5]]).astype(np.float32)
+        labels = np.array([[0.7, 0.3],
+                        [0.4, 0.6],
+                        [0.2, 0.8]]).astype(np.float32)
+        loss = sigmoid_cross_entropy_Job(x, labels)
+
+        # out [[0.612735   0.90472794]
+        #      [0.89778364 0.6990613 ]
+        #      [0.97783387 0.51372755]]
+
+
+    """
+    assert labels is not None
+    assert logits is not None
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SigmoidCrossEntropy_")
+        )
+        .Op("sigmoid_cross_entropy")
+        .Input("prediction", [logits])
+        .Input("label", [labels])
+        .Output("loss")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()[0]
+
+
+def _GetSequence(value, n, name):
+    """Formats value from input"""
+    if value is None:
+        value = [1]
+    elif not isinstance(value, collections.Sized):
+        value = [value]
+    current_n = len(value)
+    if current_n == 1:
+        return list(value * n)
+    elif current_n == n:
+        return list(value)
+    else:
+        raise ValueError(
+            "{} should be of length 1 or {} but was {}".format(name, n, current_n)
+        )
+
+
+def random_mask_like(
+    like: oneflow._oneflow_internal.BlobDesc,
+    rate: float,
+    seed: Optional[int] = None,
+    noise_shape: Optional[Sequence] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Random mask `Blob` with same shape as '`like'`.
+
+    Args:
+        like (oneflow._oneflow_internal.BlobDesc): A `Blob`.
+        rate (float): A float value for the probability that each element is dropped.
+        seed (Optional[int], optional): Optional, int value. Defaults to None.
+        noise_shape (Optional[Sequence], optional): Optional, A 1-D `Blob`, representing the shape for randomly generated keep/drop flags. Defaults to None.
+        name (Optional[str], optional):  This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A random mask `Blob` of the same shape of `like`.
+
+    Raises:
+        ValueError: If rate is not in [0, 1). Rate=1 is not allowed.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def random_mask_like_Job(like: tp.Numpy.Placeholder((5, 5), dtype=flow.float32)
+        ) -> tp.Numpy:
+
+            return flow.nn.random_mask_like(like=like,
+                                            rate=0.5)
+
+
+        like = np.ones(shape=(5, 5)).astype(np.float32)
+        random_mask = random_mask_like_Job(like)
+
+        # out [[0 0 0 0 0]
+        #      [1 1 1 0 0]
+        #      [1 0 1 1 0]
+        #      [0 0 0 0 1]
+        #      [1 0 1 1 1]]
+
+    """
+    assert rate is not None and rate >= 0.0 and (rate < 1.0)
+    if noise_shape is not None:
+        assert 0, "noise_shape will be supported later."
+        assert isinstance(noise_shape, (list, tuple))
+    if seed is not None:
+        assert name is not None
+    if name is None:
+        mask_op = (
+            flow.user_op_builder(id_util.UniqueStr("RandomMaskLike_"))
+            .Op("random_mask_like")
+            .Input("like", [like])
+            .Output("out")
+            .Attr("rate", float(rate))
+        )
+        if seed is not None:
+            mask_op.Attr("seed", seed)
+        else:
+            mask_op.Attr("seed", random.randint(-sys.maxsize, sys.maxsize))
+        return mask_op.Build().InferAndTryRun().RemoteBlobList()[0]
+    else:
+        module = flow.find_or_create_module(
+            name, lambda: RandomMaskLike(rate=rate, seed=seed, name=name)
+        )
+        return module(like)
+
+
+class RandomMaskLike(module_util.Module):
+    def __init__(self, rate: float, seed: Optional[int] = None, name: str = None):
+        module_util.Module.__init__(self, name)
+        if seed is None:
+            seed = random.randint(-sys.maxsize, sys.maxsize)
+        self.op_module_builder = (
+            flow.user_op_module_builder("random_mask_like")
+            .InputSize("like", 1)
+            .Output("out")
+            .Attr("rate", float(rate))
+            .Attr("seed", seed)
+            .CheckAndComplete()
+        )
+        self.op_module_builder.user_op_module.InitOpKernel()
+
+    def forward(self, like: oneflow._oneflow_internal.BlobDesc):
+        if self.call_seq_no == 0:
+            name = self.module_name
+        else:
+            name = id_util.UniqueStr("RandomMaskLike_")
+        return (
+            self.op_module_builder.OpName(name)
+            .Input("like", [like])
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+
+
+def dropout(
+    x: oneflow._oneflow_internal.BlobDesc,
+    rate: float,
+    noise_shape: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    seed: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """For preventing overfitting, randomly set elements to zero.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A floating point `Blob`.
+        rate (float): A scalar `Blob` with the same type as x. The probability that each element is dropped.
+        noise_shape (Optional[oneflow._oneflow_internal.BlobDesc], optional):  optional: A 1-D `Blob`, representing the shape for randomly generated keep/drop flags. Defaults to None.Defaults to None.
+        seed (Optional[int], optional):  Optional int value. Defaults to None.
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc:   A `Blob` of the same shape of x.
+
+    Raises:
+        ValueError: If rate is not in [0, 1) or if x is not a floating point `Blob`. Rate=1 is not allowed.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+
+
+        def lenet(data, train=False):
+            initializer = flow.truncated_normal(0.1)
+            conv1 = flow.layers.conv2d(
+                data,
+                32,
+                5,
+                padding="SAME",
+                activation=flow.nn.relu,
+                name="conv1",
+                kernel_initializer=initializer,
+            )
+            pool1 = flow.nn.max_pool2d(
+                conv1, ksize=2, strides=2, padding="SAME", name="pool1", data_format="NCHW"
+            )
+            conv2 = flow.layers.conv2d(
+                pool1,
+                64,
+                5,
+                padding="SAME",
+                activation=flow.nn.relu,
+                name="conv2",
+                kernel_initializer=initializer,
+            )
+            pool2 = flow.nn.max_pool2d(
+                conv2, ksize=2, strides=2, padding="SAME", name="pool2", data_format="NCHW"
+            )
+            reshape = flow.reshape(pool2, [pool2.shape[0], -1])
+            hidden = flow.layers.dense(
+                reshape,
+                512,
+                activation=flow.nn.relu,
+                kernel_initializer=initializer,
+                name="dense1",
+            )
+            if train:
+                hidden = flow.nn.dropout(hidden, rate=0.5, name="dropout")
+
+            return flow.layers.dense(hidden, 10, kernel_initializer=initializer, name="dense2")
+
+    """
+    assert rate is not None and rate >= 0.0 and (rate < 1.0)
+    if not flow.current_global_function_desc().IsTrainable() or rate == 0.0:
+        return x
+    if seed is not None:
+        assert name is not None
+    if name is None:
+        name = id_util.UniqueStr("Dropout_")
+    mask = random_mask_like(
+        x, rate, seed, noise_shape, "%s-dropout_random_mask_like" % name
+    )
+    return (
+        flow.user_op_builder(name)
+        .Op("dropout")
+        .Input("in", [x])
+        .Input("mask", [mask])
+        .Output("out")
+        .Attr("scale", float(1.0 / (1.0 - rate)))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def deconv2d(
+    value: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    filter: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    output_shape: Tuple[int, int, int, int] = None,
+    strides: Optional[Union[int, Sequence[int]]] = None,
+    padding: str = "VALID",
+    data_format: str = "NCHW",
+    name: Optional[str] = None,
+    input: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    filters: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    dilations: Optional[Union[int, Sequence[int]]] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """2d transposed convolution.
+
+    Args:
+        value (Optional[oneflow._oneflow_internal.BlobDesc], optional):   4-d `Blob`. Defaults to None.
+        filter (Optional[oneflow._oneflow_internal.BlobDesc], optional): Filter of transposed convolution, usually a variable. Defaults to None.
+        output_shape (Tuple[int, int, int, int]): A 1-D `Blob` representing the output shape of the deconvolution op. Defaults to None.
+        strides (Optional[Union[int, Sequence[int]]], optional): `int` or `int list`. Defaults to None.
+        padding (str, optional):  `'VALID'` or `'SAME'`. Defaults to "VALID".
+        data_format (str, optional): `'NHWC'` or `'NCHW'`. Defaults to "NCHW".
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+        input (Optional[oneflow._oneflow_internal.BlobDesc], optional): Alias for value. Defaults to None.
+        filters (Optional[oneflow._oneflow_internal.BlobDesc], optional): Alias for filter. Defaults to None.
+        dilations (Optional[Union[int, Sequence[int]]], optional): The dilation factor for each dimension of input. Defaults to None.
+
+    Raises:
+        ValueError: shapes of `filter` and `input` must match.
+        ValueError: dilations must be an int or a list.
+        ValueError: data_format must be "NHWC" or "NCHW".
+        ValueError: padding must be "SAME" or "VALID".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob` with the same type as `value`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def deconv2d(input, filters, kernel_size, strides, padding, name):
+            input_shape = input.shape
+            weight_initializer = flow.truncated_normal(0.1)
+            weight_regularizer = flow.regularizers.l2(0.0005)
+            weight_shape = (filters,
+                            input_shape[1],
+                            kernel_size[0],
+                            kernel_size[1])
+
+            weight = flow.get_variable(
+                name + "-weight",
+                shape=weight_shape,
+                initializer=weight_initializer,
+                regularizer=weight_regularizer,
+            )
+            return flow.nn.conv2d_transpose(value=input,
+                                            output_shape=(1, 32, 64, 64),
+                                            filter=weight,
+                                            strides=strides,
+                                            padding=padding,
+                                            name=name)
+
+
+        @flow.global_function()
+        def deconv2d_Job(x: tp.Numpy.Placeholder((1, 32, 32, 32),)
+        ) -> tp.Numpy:
+            deconv = deconv2d(x,
+                            filters=32,
+                            kernel_size=[3, 3],
+                            strides=2,
+                            padding='SAME',
+                            name="Convlayer")
+            return deconv
+
+
+        x = np.random.randn(1, 32, 32, 32).astype(np.float32)
+        out = deconv2d_Job(x)
+
+        # out.shape (1, 32, 64, 64)
+
+    """
+    assert (value is not None) ^ (
+        input is not None
+    ), "only one of `input` and `value` could be not None"
+    assert (filter is not None) ^ (
+        filters is not None
+    ), "only one of `filter` and `filters` could be not None"
+    filters = filters or filter
+    input = input or value
+    NDims = 2
+    assert len(input.shape) == 2 + NDims
+    assert len(filters.shape) == 2 + NDims
+    assert len(output_shape) == 2 + NDims
+    assert output_shape[0] == input.shape[0]
+    if dilations is None:
+        dilations = [1, 1]
+    elif isinstance(dilations, (list, tuple)):
+        assert len(dilations) == 2, ValueError(
+            "dilations length must be 2 when passed as a list."
+        )
+    elif isinstance(dilations, int):
+        dilations = [dilations, dilations]
+    else:
+        raise ValueError("dilations must be an int or a list.")
+    if data_format.upper() == "NCHW":
+        input_shape = input.shape[2:]
+        kernel_size = filters.shape[2:4]
+        channels = filters.shape[1]
+        assert output_shape[1] == channels
+        output_shape = output_shape[2:4]
+    elif data_format.upper() == "NHWC":
+        input_shape = input.shape[1:3]
+        kernel_size = filters.shape[-3:-1]
+        channels = filters.shape[3]
+        assert output_shape[3] == channels
+        output_shape = output_shape[1:3]
+        assert dilations == [1, 1], ValueError(
+            "dialtions must be 1 when data format is NHWC "
+        )
+    else:
+        raise ValueError('data_format must be "NHWC" or "NCHW".')
+    channel_pos = "channels_first" if data_format.startswith("NC") else "channels_last"
+    if isinstance(strides, (list, tuple)):
+        assert len(strides) == NDims, ValueError(
+            "strides length must be 2 when passed as a list."
+        )
+    elif isinstance(strides, int):
+        strides = [strides, strides]
+    else:
+        raise ValueError("strides must be an int or a list.")
+    output_padding = [0] * NDims
+    padding_needed = [0] * NDims
+    if padding.upper() == "VALID":
+        for i in range(NDims):
+            effective_filter_size = (kernel_size[i] - 1) * dilations[i] + 1
+            assert (output_shape[i] + strides[i] - effective_filter_size) // strides[
+                i
+            ] == input_shape[i]
+            tmp_output_shape = (input_shape[i] - 1) * strides[i] + effective_filter_size
+            output_padding[i] = output_shape[i] - tmp_output_shape
+    elif padding.upper() == "SAME":
+        padding_left = [0] * NDims
+        padding_right = [0] * NDims
+        for i in range(NDims):
+            assert (output_shape[i] + strides[i] - 1) // strides[i] == input_shape[i]
+            effective_filter_size = (kernel_size[i] - 1) * dilations[i] + 1
+            padding_needed[i] = max(
+                0,
+                (input_shape[i] - 1) * strides[i]
+                + effective_filter_size
+                - output_shape[i],
+            )
+            tmp_output_shape = (
+                (input_shape[i] - 1) * strides[i]
+                + effective_filter_size
+                - padding_needed[i]
+            )
+            output_padding[i] = output_shape[i] - tmp_output_shape
+            padding_left[i] = padding_needed[i] // 2
+            padding_right[i] = padding_needed[i] - padding_needed[i] // 2
+    else:
+        raise ValueError('padding must be "SAME" or "VALID".')
+    if padding.upper() == "SAME" and padding_left != padding_right:
+        assert data_format.upper() == "NCHW"
+        padding_before = [0] * NDims
+        input = (
+            flow.user_op_builder(
+                name if name is not None else id_util.UniqueStr("Deconv2d_")
+            )
+            .Op("deconv2d")
+            .Input("in", [input])
+            .Input("weight", [filters])
+            .Output("out")
+            .Attr("filters", channels)
+            .Attr("padding_before", padding_before)
+            .Attr("data_format", channel_pos)
+            .Attr("kernel_size", kernel_size)
+            .Attr("strides", strides)
+            .Attr("dilation_rate", dilations)
+            .Attr("output_padding", output_padding)
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+        return flow.pad_grad(
+            input,
+            [
+                (0, 0),
+                (0, 0),
+                (padding_left[0], padding_right[0]),
+                (padding_left[1], padding_right[1]),
+            ],
+            name=name + "_pad_grad" if name is not None else None,
+        )
+    assert len(padding_needed) == len(input.shape) - 2
+    padding_before = []
+    for pad in padding_needed:
+        assert pad % 2 == 0
+        padding_before.append(pad // 2)
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Deconv2d_")
+        )
+        .Op("deconv2d")
+        .Input("in", [input])
+        .Input("weight", [filters])
+        .Output("out")
+        .Attr("filters", channels)
+        .Attr("padding_before", padding_before)
+        .Attr("data_format", channel_pos)
+        .Attr("kernel_size", kernel_size)
+        .Attr("strides", strides)
+        .Attr("dilation_rate", dilations)
+        .Attr("output_padding", output_padding)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def deconv2d_torch(
+    value=None,
+    filter=None,
+    output_padding=None,
+    strides=None,
+    padding_needed=None,
+    data_format="NCHW",
+    name=None,
+    input=None,
+    filters=None,
+    dilations=None,
+):
+    assert (value is not None) ^ (
+        input is not None
+    ), "only one of `input` and `value` could be not None"
+    assert (filter is not None) ^ (
+        filters is not None
+    ), "only one of `filter` and `filters` could be not None"
+    filters = filters or filter
+    input = input or value
+    NDims = 2
+    assert len(input.shape) == 2 + NDims
+    assert len(filters.shape) == 2 + NDims
+    if dilations is None:
+        dilations = [1, 1]
+    elif isinstance(dilations, (list, tuple)):
+        assert len(dilations) == 2, ValueError(
+            "dilations length must be 2 when passed as a list."
+        )
+    elif isinstance(dilations, int):
+        dilations = [dilations, dilations]
+    else:
+        raise ValueError("dilations must be an int or a list.")
+    if data_format.upper() == "NCHW":
+        input_shape = input.shape[2:]
+        kernel_size = filters.shape[2:4]
+        channels = filters.shape[1]
+    elif data_format.upper() == "NHWC":
+        input_shape = input.shape[1:3]
+        kernel_size = filters.shape[-3:-1]
+        channels = filters.shape[3]
+        assert dilations == [1, 1], ValueError(
+            "dialtions must be 1 when data format is NHWC "
+        )
+    else:
+        raise ValueError('data_format must be "NHWC" or "NCHW".')
+    channel_pos = "channels_first" if data_format.startswith("NC") else "channels_last"
+    if isinstance(strides, (list, tuple)):
+        assert len(strides) == NDims, ValueError(
+            "strides length must be 2 when passed as a list."
+        )
+    elif isinstance(strides, int):
+        strides = [strides, strides]
+    else:
+        raise ValueError("strides must be an int or a list.")
+    assert len(padding_needed) == len(input.shape) - 2
+    padding_before = []
+    for pad in padding_needed:
+        assert pad % 2 == 0
+        padding_before.append(pad // 2)
+    if output_padding is None:
+        output_padding = (0, 0)
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Deconv2d_")
+        )
+        .Op("deconv2d")
+        .Input("in", [input])
+        .Input("weight", [filters])
+        .Output("out")
+        .Attr("filters", channels)
+        .Attr("padding_before", padding_before)
+        .Attr("data_format", channel_pos)
+        .Attr("kernel_size", kernel_size)
+        .Attr("strides", strides)
+        .Attr("dilation_rate", dilations)
+        .Attr("output_padding", output_padding)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def leaky_relu(
+    x: oneflow._oneflow_internal.BlobDesc,
+    alpha: float = 0.2,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Leaky ReLU activation.
+
+    .. math::
+        out = max(x, alpha*x)
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc):  A `Blob` representing preactivation values.
+        alpha (float, optional): Slope of the activation function at x < 0 with float type. Default value is 0.2.
+        name (Optional[str], optional): This operator's name(optional). Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activation `Blob`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def leaky_relu_Job(x: tp.Numpy.Placeholder((5, ),)
+        ) -> tp.Numpy:
+            leaky_relu = flow.nn.leaky_relu(x, alpha=0.2)
+
+            return leaky_relu
+
+
+        x = np.array([-10, -5, 0, 5, 10]).astype(np.float32)
+        out = leaky_relu_Job(x)
+
+        # out [-2. -1.  0.  5. 10.]
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("LeakyRelu_")
+        )
+        .Op("leaky_relu")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("alpha", float(alpha))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def elu(
+    x: oneflow._oneflow_internal.BlobDesc,
+    alpha: float = 1.0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The ELU activation.
+
+    The formula is:
+
+    .. math::
+
+        \\text{ELU}(x) = \\begin{cases}
+				x & \\text{ if } x \\gt 0  \\\\
+                \\alpha*(exp(x)-1) & \\text{ if } x \\le 0 \\\\
+    		    \\end{cases}
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def elu_job(x: tp.Numpy.Placeholder(shape=(3, )))->tp.Numpy:
+            return flow.nn.elu(x, alpha=1.0)
+
+
+        x = np.array([-3.5, 1, 3.5]).astype(np.float32)
+        out = elu_job(x)
+
+        # output [-0.9698026  1.         3.5      ]
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Tensor.
+        alpha (float, optional): The `alpha` value for the ELU formula. Defaults to 1.0.
+        name (Optional[str], optional): The name for the operator. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activated Tensor.
+    """
+    alpha = float(alpha)
+    if name is None:
+        name = id_util.UniqueStr("Elu_")
+    return (
+        flow.user_op_builder(name)
+        .Op("elu")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("alpha", alpha)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def hard_sigmoid(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The Hardsigmoid activation.
+
+    The formula is:
+
+    .. math::
+
+        \\text{Hardsigmoid}(x) = \\begin{cases}
+            0 & \\text{ if } x \\le -3  \\\\
+            1 & \\text{ if } x \\ge +3 \\\\
+            \\frac{x}{6} + \\frac{1}{2} & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def hardsigmoid_job(x: tp.Numpy.Placeholder(shape=(3, )))->tp.Numpy:
+            out = flow.nn.hardsigmoid(x)
+
+            return out
+
+
+        x = np.array([-3.1, 0, 3.3]).astype(np.float32)
+        out = hardsigmoid_job(x)
+
+        # output [0.  0.5 1. ]
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Tensor.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activated Tensor.
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("HardSigmoid_")
+        )
+        .Op("hardsigmoid")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def mish(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The Mish activation function.
+
+    The equation is:
+
+    .. math::
+
+        out = x*tanh(ln(1+e^x))
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def mish_job(x: tp.Numpy.Placeholder(shape=(5, )))->tp.Numpy:
+            return flow.nn.mish(x)
+
+
+        x = np.array([-0.5, 0, 0.5, 1.0, 1.5]).astype(np.float32)
+        out = mish_job(x)
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    if name is None:
+        name = id_util.UniqueStr("Mish_")
+    return x * flow.math.tanh(
+        flow.math.softplus(x, name=name + "softplus"), name=name + "tanh"
+    )
+
+
+def swish(
+    x: oneflow._oneflow_internal.BlobDesc, beta: float = 1.0, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The Swish activation function.
+
+    The equation is:
+
+    .. math::
+
+        out = x * sigmoid(\\beta*x)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def swish_job(x: tp.Numpy.Placeholder(shape=(5, )))->tp.Numpy:
+            return flow.nn.swish(x)
+
+
+        x = np.array([-0.5, 0, 0.5, 1, 1.5]).astype(np.float32)
+
+
+        out = swish_job(x)
+        # output [-0.18877034  0.          0.31122968  0.7310586   1.2263618 ]
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        beta (float, optional): The smooth factor. Defaults to 1.0.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    if name is None:
+        name = id_util.UniqueStr("Swish_")
+    return x * flow.math.sigmoid(beta * x, name=name + "_sigmoid")
+
+
+def hardswish(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The Hardswish activation.
+
+    The formula is:
+
+    .. math::
+
+        \\text{Hardswish}(x) = \\begin{cases}
+            0 & \\text{ if } x \\le -3  \\\\
+            x & \\text{ if } x \\ge +3 \\\\
+            x*(x+3)/6 & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def hardswish_job(x: tp.Numpy.Placeholder(shape=(3, )))->tp.Numpy:
+            return flow.nn.hardswish(x)
+
+
+        x = np.array([-3.5, 1, 3.5]).astype(np.float32)
+        out = hardswish_job(x)
+
+        # output [0.        0.6666667 3.5      ]
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Tensor.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activated Tensor.
+    """
+    if name is None:
+        name = id_util.UniqueStr("HardSwish_")
+    return (
+        flow.user_op_builder(name)
+        .Op("hardswish")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def hardtanh(
+    x: oneflow._oneflow_internal.BlobDesc,
+    min_val: float = -1.0,
+    max_val: float = 1.0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The Hardtanh activation.
+
+    The equation is:
+
+    .. math::
+
+        \\text{HardTanh}(x) = \\begin{cases}
+            max\\_val & \\text{ if } x > max\\_val \\\\
+            -min\\_val & \\text{ if } x < min\\_val \\\\
+            x & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+        @flow.global_function()
+        def hardtanh_job(x: tp.Numpy.Placeholder(shape=(2, 3)))->tp.Numpy:
+            return flow.nn.hardtanh(x, min_val=-1.25, max_val=1.2)
+
+
+        x = np.array([[-1.5, -1.1, 0.6],
+                    [1.2, 1.3, 1.5]]).astype(np.float32)
+        out = hardtanh_job(x)
+
+        # output [[-1.25 -1.1   0.6 ]
+        #         [ 1.2   1.2   1.2 ]]
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Tensor.
+        min_val (float, optional): The minimum value of the linear region range. Defaults to -1.
+        max_val (float, optional): The maximum value of the linear region range. Defaults to 1.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activated tensor.
+    """
+    if name is None:
+        name = id_util.UniqueStr("Hardtanh_")
+    min_val = float(min_val)
+    max_val = float(max_val)
+    assert min_val < max_val, "max_val should be larger than min_val"
+    return (
+        flow.user_op_builder(name)
+        .Op("hardtanh")
+        .Input("in", [x])
+        .Attr("min_val", min_val)
+        .Attr("max_val", max_val)
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def relu6(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Relu6 activation, it clips the value around (0, 6).
+
+    The equation is:
+
+    .. math::
+
+        \\text{Relu6}(x) = \\begin{cases}
+            6 & \\text{ if } x > 6 \\\\
+            0 & \\text{ if } x < 0 \\\\
+            x & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    For example:
+
+    .. code-block::
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def relu6_job(x: tp.Numpy.Placeholder(shape=(2, 3)))->tp.Numpy:
+            return flow.nn.relu6(x)
+
+        x = np.array([[-1, -0.5, 0.0],
+                      [0.5, 6.0, 7]]).astype(np.float32)
+
+        out = relu6_job(x)
+
+        # output [[0.  0.  0. ]
+        #         [0.5 6.  6. ]]
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Tensor.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activated Tensor.
+    """
+    if name is None:
+        name = id_util.UniqueStr("Relu6_")
+    return flow.nn.hardtanh(x, min_val=0.0, max_val=6.0, name=name)
+
+
+def l1_loss(
+    input: oneflow._oneflow_internal.BlobDesc,
+    target: oneflow._oneflow_internal.BlobDesc,
+    reduction: str = "mean",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the L1 Loss between each element in `input` and `target`.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        output = |Target - Input|
+
+    if reduction = "mean":
+
+    .. math::
+
+        output = \\frac{1}{n}\\sum_{i=1}^n|Target_i - Input_i|
+
+    if reduction = "sum":
+
+    .. math::
+
+        output = \\sum_{i=1}^n|Target_i - Input_i|
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        target (oneflow._oneflow_internal.BlobDesc): The target value.
+        reduction (str): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def l1_job(x: tp.Numpy.Placeholder(shape=(3, 3)),
+                y: tp.Numpy.Placeholder(shape=(3, 3))) -> tp.Numpy:
+            out = flow.nn.L1Loss(x, y, reduction="mean", name="l1")
+
+            return out
+
+
+        input = np.array([[1, 1, 1], [2, 2, 2], [7, 7, 7]]).astype(np.float32)
+        target = np.array([[4, 4, 4], [4, 4, 4], [4, 4, 4]]).astype(np.float32)
+
+        out = l1_job(input, target)
+
+        # output [2.6666667]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def l1_job(x: tp.Numpy.Placeholder(shape=(3, 3)),
+                y: tp.Numpy.Placeholder(shape=(3, 3))) -> tp.Numpy:
+            out = flow.nn.L1Loss(x, y, reduction="sum", name="l1")
+
+            return out
+
+
+        input = np.array([[1, 1, 1], [2, 2, 2], [7, 7, 7]]).astype(np.float32)
+        target = np.array([[4, 4, 4], [4, 4, 4], [4, 4, 4]]).astype(np.float32)
+
+        out = l1_job(input, target)
+
+        # output [24.]
+
+    """
+    assert (
+        input.shape == target.shape
+    ), "The Input shape must be the same as Target shape"
+    assert reduction in [
+        "none",
+        "mean",
+        "sum",
+    ], "{} is not a valid value for reduction, The reduction must be the one of `none`, `mean`, `sum`. ".format(
+        reduction
+    )
+    if name is None:
+        name = id_util.UniqueStr("L1Loss")
+    l1_value = flow.math.abs(
+        flow.math.subtract(target, input, name=name + "_sub"), name=name + "_abs"
+    )
+    if reduction == "mean":
+        return flow.math.reduce_mean(l1_value, name=name + "_reduce_mean")
+    elif reduction == "sum":
+        return flow.math.reduce_sum(l1_value, name=name + "_reduce_sum")
+    else:
+        return l1_value
+
+
+def bce_loss(
+    input: oneflow._oneflow_internal.BlobDesc,
+    target: oneflow._oneflow_internal.BlobDesc,
+    weight: remote_blob_util = None,
+    reduction: str = "mean",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the binary cross entropy loss.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        out = -(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    if reduction = "mean":
+
+    .. math::
+
+        out = -\\frac{1}{n}\\sum_{i=1}^n(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    if reduction = "sum":
+
+    .. math::
+
+        out = -\\sum_{i=1}^n(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def bce_loss_job(input: tp.Numpy.Placeholder(shape=(2, 3)),
+                                target: tp.Numpy.Placeholder(shape=(2, 3)),
+                                weight: tp.Numpy.Placeholder(shape=(2, 3)))->tp.Numpy:
+            sigmoid_input = flow.math.sigmoid(input)
+            return flow.nn.BCELoss(sigmoid_input, target, weight, reduction='mean')
+
+
+        np_input = np.array([[1.2, 0.2, -0.3],
+                             [0.7, 0.6, -2]]).astype(np.float32)
+
+        np_target = np.array([[0, 1, 0],
+                              [1, 0, 1]]).astype(np.float32)
+
+        np_weight = np.array([[2, 2, 2],
+                              [2, 2, 2]]).astype(np.float32)
+
+        # output [2.0611262]
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        target (oneflow._oneflow_internal.BlobDesc): The target value.
+        weight (remote_blob_util, optional): The manual rescaling weight to the loss. Default to None, whose corresponding weight value is 1.
+        reduction (str, optional): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Attention:
+        The input value must be in the range of (0, 1). Or the loss function may return `nan` value.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    assert (
+        input.shape == target.shape
+    ), "The Input shape must be the same as Target shape"
+    assert reduction in [
+        "none",
+        "mean",
+        "sum",
+    ], "{} is not a valid value for reduction, The reduction must be the one of `none`, `mean`, `sum`. ".format(
+        reduction
+    )
+    if name is None:
+        name = id_util.UniqueStr("BCELoss")
+    _cross_entropy_loss = flow.math.negative(
+        target * flow.math.log(input) + (1 - target) * flow.math.log(1 - input)
+    )
+    if weight is not None:
+        assert (
+            weight.shape == input.shape
+        ), "The weight shape must be the same as Input shape"
+        _weighted_loss = weight * _cross_entropy_loss
+    else:
+        _weighted_loss = _cross_entropy_loss
+    if reduction == "mean":
+        return flow.math.reduce_mean(_weighted_loss, name=name + "_reduce_mean")
+    elif reduction == "sum":
+        return flow.math.reduce_sum(_weighted_loss, name=name + "_reduce_sum")
+    else:
+        return _weighted_loss
+
+
+def bce_with_logits_loss(
+    input: oneflow._oneflow_internal.BlobDesc,
+    target: oneflow._oneflow_internal.BlobDesc,
+    weight: remote_blob_util = None,
+    pos_weight: remote_blob_util = None,
+    reduction: str = "mean",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator combines the `Sigmoid` and `BCELoss` together. For numerical stability,
+    we apply some math tricks instead of using `Sigmoid` layer with `BCELoss`.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        out = -weight*[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    if reduction = "mean":
+
+    .. math::
+
+        out = -\\frac{weight}{n}\\sum_{i=1}^n[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    if reduction = "sum":
+
+    .. math::
+
+        out = -weight*\\sum_{i=1}^n[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def bce_with_logits_loss_job(input: tp.Numpy.Placeholder(shape=(2, 3)),
+                                     target: tp.Numpy.Placeholder(shape=(2, 3)),
+                                     weight: tp.Numpy.Placeholder(shape=(2, 3)),
+                                     pos_weight: tp.Numpy.Placeholder(shape=(3, )))->tp.Numpy:
+            return flow.nn.BCEWithLogitsLoss(input, target, weight, pos_weight, reduction='mean')
+
+
+        np_input = np.array([[1.2, 0.2, -0.3],
+                             [0.7, 0.6, -2]]).astype(np.float32)
+
+        np_target = np.array([[0, 1, 0],
+                              [1, 0, 1]]).astype(np.float32)
+
+        np_weight = np.array([[2, 2, 2],
+                              [2, 2, 2]]).astype(np.float32)
+
+        np_pos_weight = np.array([1.2, 1.3, 1.4]).astype(np.float32)
+
+        out = bce_with_logits_loss_job(np_input, np_target, np_weight, np_pos_weight)
+
+        # output [2.4314096]
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Tensor.
+        target (oneflow._oneflow_internal.BlobDesc): The target Tensor.
+        weight (remote_blob_util, optional): The manual rescaling weight to the loss. Defaults to None.
+        pos_weight (remote_blob_util, optional): The manual rescaling weight to the positive examples. Defaults to None.
+        reduction (str, optional): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    assert (
+        input.shape == target.shape
+    ), "The Input shape must be the same as Target shape"
+    assert reduction in [
+        "none",
+        "mean",
+        "sum",
+    ], "{} is not a valid value for reduction, The reduction must be the one of `none`, `mean`, `sum`. ".format(
+        reduction
+    )
+    if name is None:
+        name = id_util.UniqueStr("BCEWithLogitsLoss")
+    _neg_input = flow.math.negative(input)
+    _max_val = flow.clip(_neg_input, min_value=0)
+    _neg_max_val = flow.math.negative(_max_val)
+    if pos_weight:
+        assert (
+            pos_weight.shape[0] == input.shape[-1]
+        ), "The length of `pos_weight` must be equal to the number of classes. Found the length of pos_weight {} vs classes {}".format(
+            pos_weight.shape[0], input.shape[-1]
+        )
+        _log_weight = (pos_weight - 1) * target + 1
+        _loss = (1 - target) * input + _log_weight * (
+            flow.math.log(
+                flow.math.exp(_neg_max_val) + flow.math.exp(_neg_input - _max_val)
+            )
+            + _max_val
+        )
+    else:
+        _loss = (1 - target) * input + _max_val
+        _loss += flow.math.log(
+            flow.math.exp(_neg_max_val) + flow.math.exp(_neg_input - _max_val)
+        )
+    if weight is not None:
+        assert (
+            weight.shape == input.shape
+        ), "The weight shape must be the same as Input shape"
+        _weighted_loss = weight * _loss
+    else:
+        _weighted_loss = _loss
+    if reduction == "mean":
+        return flow.math.reduce_mean(_weighted_loss, name=name + "_reduce_mean")
+    elif reduction == "sum":
+        return flow.math.reduce_sum(_weighted_loss, name=name + "_reduce_sum")
+    else:
+        return _weighted_loss
+
+
+def mse_loss(
+    input: oneflow._oneflow_internal.BlobDesc,
+    target: oneflow._oneflow_internal.BlobDesc,
+    reduction: str = "mean",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the mean squared error between each element in `input` and `target`.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        out = (Target_i - Input_i)^2
+
+    if reduction = "mean":
+
+    .. math::
+
+        out = \\frac{1}{n}\\sum_{i=1}^n(Target_i - Input_i)^2
+
+    if reduction = "sum":
+
+    .. math::
+
+        out = \\sum_{i=1}^n(Target_i - Input_i)^2
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        target (oneflow._oneflow_internal.BlobDesc): The target value.
+        reduction (str) = The reduce type, it can be the one of "none", "mean", "sum". Defaults to "mean".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def mseloss_job(input: tp.Numpy.Placeholder(shape=(3, 3)),
+                        target: tp.Numpy.Placeholder(shape=(3, 3)))->tp.Numpy:
+            out = flow.nn.MSELoss(input, target, reduction="mean")
+            return out
+
+        input = np.array([[1, 1, 1], [2, 2, 2], [7, 7, 7]]).astype(np.float32)
+        target = np.array([[4, 4, 4], [4, 4, 4], [4, 4, 4]]).astype(np.float32)
+
+        out = mseloss_job(input, target)
+
+        # output [7.3333335]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def mseloss_job(input: tp.Numpy.Placeholder(shape=(3, 3)),
+                        target: tp.Numpy.Placeholder(shape=(3, 3)))->tp.Numpy:
+            out = flow.nn.MSELoss(input, target, reduction="sum")
+            return out
+
+        input = np.array([[1, 1, 1], [2, 2, 2], [7, 7, 7]]).astype(np.float32)
+        target = np.array([[4, 4, 4], [4, 4, 4], [4, 4, 4]]).astype(np.float32)
+
+        out = mseloss_job(input, target)
+
+        # output [66.]
+    """
+    assert (
+        input.shape == target.shape
+    ), "The Input shape must be the same as Target shape"
+    assert reduction in [
+        "none",
+        "mean",
+        "sum",
+    ], "{} is not a valid value for reduction, The reduction must be the one of `none`, `mean`, `sum`. ".format(
+        reduction
+    )
+    if name is None:
+        name = id_util.UniqueStr("MSELoss")
+    mean_squared_difference = flow.math.squared_difference(
+        target, input, name=name + "_mean_squared"
+    )
+    if reduction == "mean":
+        return flow.math.reduce_mean(
+            mean_squared_difference, name=name + "_reduce_mean"
+        )
+    elif reduction == "sum":
+        return flow.math.reduce_sum(mean_squared_difference, name=name + "_reduce_sum")
+    else:
+        return mean_squared_difference
+
+
+def margin_ranking_loss(
+    input1: oneflow._oneflow_internal.BlobDesc,
+    input2: oneflow._oneflow_internal.BlobDesc,
+    target: oneflow._oneflow_internal.BlobDesc,
+    margin: float = 0.0,
+    reduction: str = "mean",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the Margin Ranking loss.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        out = \\max\\ (0, -y*(x_1-x_2)+margin)
+
+    if reduction = "mean":
+
+    .. math::
+
+        out = \\frac{1}{n}\\sum_{i=1}^n\\max\\ (0, -y*(x_1-x_2)+margin)
+
+    if reduction = "sum":
+
+    .. math::
+
+        out = \\sum_{i=1}^n\\max\\ (0, -y*(x_1-x_2)+margin)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def margin_ranking_loss_job(input1: tp.Numpy.Placeholder(shape=(3, 3)),
+                                    input2: tp.Numpy.Placeholder(shape=(3, 3)),
+                                    target: tp.Numpy.Placeholder(shape=(3, 3)))->tp.Numpy:
+            out = flow.nn.MarginRankingLoss(input1, input2, target, margin=1.0)
+            return out
+
+        np_input1 = np.array([[1, 2, 3],
+                            [4, 5, 6],
+                            [7, 8, 9]]).astype(np.float32)
+        np_input2 = np.array([[2, 2, 2],
+                            [2, 2, 2],
+                            [2, 2, 2]]).astype(np.float32)
+        np_target = np.array([[3, 3, 3],
+                            [3, 3, 3],
+                            [3, 3, 3]]).astype(np.float32)
+
+        out = margin_ranking_loss_job(np_input1, np_input2, np_target)
+
+        # output [0.5555556]
+
+    Args:
+        input1 (oneflow._oneflow_internal.BlobDesc): The ranking score of input1 Blob.
+        input2 (oneflow._oneflow_internal.BlobDesc): The ranking score of input2 Blob.
+        target (oneflow._oneflow_internal.BlobDesc): The target Blob.
+        margin (float): The margin value. Defaults to 0.0.
+        reduction (str, optional): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    assert (
+        input1.shape == input2.shape
+    ), "The shape of `input1`, `input2` must be the same. "
+    assert reduction in [
+        "none",
+        "mean",
+        "sum",
+    ], "{} is not a valid value for reduction, The reduction must be the one of `none`, `mean`, `sum`. ".format(
+        reduction
+    )
+    if name is None:
+        name = id_util.UniqueStr("MarginRankingLoss")
+    _margin_loss = flow.math.negative(flow.math.subtract(input1, input2))
+    _margin_loss = flow.math.multiply(target, _margin_loss)
+    _margin_loss = flow.math.add(margin, _margin_loss)
+    _clipped_margin_loss = flow.clip(_margin_loss, min_value=0.0)
+    if reduction == "none":
+        return _clipped_margin_loss
+    elif reduction == "mean":
+        return flow.math.reduce_mean(_clipped_margin_loss, name=name + "_reduce_mean")
+    else:
+        return flow.math.reduce_sum(_clipped_margin_loss, name=name + "_reduce_sum")
+
+
+def triplet_margin_loss(
+    anchor: oneflow._oneflow_internal.BlobDesc,
+    positive: oneflow._oneflow_internal.BlobDesc,
+    negative: oneflow._oneflow_internal.BlobDesc,
+    margin: float = 1.0,
+    p: float = 2.0,
+    eps: float = 1e-06,
+    swap: bool = False,
+    reduction: str = "mean",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the Triplet Margin Loss.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        output = \\max\\{\\left\\lVert a_i - p_i \\right\\rVert_p - \\left\\lVert a_i - n_i \\right\\rVert_p + {\\rm margin}, 0\\}
+
+    if reduction = "mean":
+
+    .. math::
+
+        output = \\frac{1}{n}\\sum_{i=1}^n\\max\\{\\left\\lVert a_i - p_i \\right\\rVert_p - \\left\\lVert a_i - n_i \\right\\rVert_p + {\\rm margin}, 0\\}
+
+    if reduction = "sum":
+
+    .. math::
+
+        output = \\sum_{i=1}^n\\max\\{\\left\\lVert a_i - p_i \\right\\rVert_p - \\left\\lVert a_i - n_i \\right\\rVert_p + {\\rm margin}, 0\\}
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def triplet_loss_job(anchor: tp.Numpy.Placeholder(shape=(3, 3)),
+                            pos: tp.Numpy.Placeholder(shape=(3, 3)),
+                            neg: tp.Numpy.Placeholder(shape=(3, 3)))->tp.Numpy:
+            out = flow.nn.TripletMarginLoss(anchor, pos, neg, margin=1.0, p=2.0)
+            return out
+
+        np_anchor = np.array([[1, 2, 3],
+                            [4, 5, 6],
+                            [7, 8, 9]]).astype(np.float32)
+        np_pos = np.array([[2, 2, 2],
+                        [2, 2, 2],
+                        [2, 2, 2]]).astype(np.float32)
+        np_neg = np.array([[3, 3, 3],
+                        [3, 3, 3],
+                        [3, 3, 3]]).astype(np.float32)
+
+        out = triplet_loss_job(np_anchor, np_pos, np_neg)
+
+        # output [1.8449262]
+
+    Args:
+        anchor (oneflow._oneflow_internal.BlobDesc): The anchor Blob.
+        positive (oneflow._oneflow_internal.BlobDesc): The positive sample Blob.
+        negative (oneflow._oneflow_internal.BlobDesc): The negative sample Blob.
+        margin (float, optional): The margin value. Defaults to 1.0.
+        p (float, optional): The norm degree for computing distance. Defaults to 2.0.
+        eps (float, optional): A small value use in norm computation. Defaults to 1e-6.
+        swap (bool, optional): Whether to swap the distance.
+        For more details you can check the Paper `Learning shallow convolutional feature descriptors with triplet losses`. Defaults to False.
+        reduction (str, optional): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    assert reduction in [
+        "none",
+        "mean",
+        "sum",
+    ], "{} is not a valid value for reduction, The reduction must be the one of `none`, `mean`, `sum`. ".format(
+        reduction
+    )
+    assert (
+        swap == False
+    ), "For now we only support `swap=True`, OneFlow still have backward error in minimum"
+    if name is None:
+        name = id_util.UniqueStr("TripletMarginLoss")
+
+    def _p_norm(x, p=2.0, name="p_norm"):
+        """Compute the p-norm
+
+        The equation is:
+
+        .. math::
+
+            out = \\sqrt[P]{\\sum_{i=0}^{n}(abs(x)^P)}
+
+        Args:
+            x ([type]): The input Blob.
+            p ([type], optional): The norm degree. Defaults to 2..
+
+        """
+        _abs_val = flow.math.abs(x, name=name + "_abs")
+        if p == 2.0:
+            _norm = flow.math.square(_abs_val, name=name + "_square")
+            _norm = flow.math.reduce_sum(_norm, axis=1, name=name + "_sum")
+            _norm_val = flow.math.sqrt(_norm, name=name + "_sqrt")
+        else:
+            _p_constant = flow.constant_like(
+                like=_abs_val, value=p, dtype=flow.float32, name=name + "_p_constant"
+            )
+            _norm = flow.math.pow(_abs_val, _p_constant, name=name + "_pow1")
+            _norm = flow.math.reduce_sum(_norm, axis=1, name=name + "_sum")
+            _p_reciprocal_constant = flow.constant_like(
+                like=_norm,
+                value=1.0 / p,
+                dtype=flow.float32,
+                name=name + "_p_reciprocal_constant",
+            )
+            _norm_val = flow.math.pow(
+                _norm, _p_reciprocal_constant, name=name + "_norm_val"
+            )
+        return _norm_val
+
+    _distance_1 = _p_norm(anchor - positive + eps, p=p, name=name + "_distance_1")
+    _distance_2 = _p_norm(anchor - negative + eps, p=p, name=name + "_distance_2")
+    if swap:
+        _distance_swap = _p_norm(positive - negative + eps, p=p)
+        _distance_swap = flow.math.reduce_sum(_distance_swap, axis=1)
+        _distance_2 = flow.math.minimum(_distance_2, _distance_swap)
+    _triplet_loss = flow.clip(margin + _distance_1 - _distance_2, min_value=0.0)
+    if reduction == "mean":
+        return flow.math.reduce_mean(_triplet_loss, name=name + "_reduce_mean")
+    elif reduction == "sum":
+        return flow.math.reduce_sum(_triplet_loss, name=name + "_reduce_sum")
+    else:
+        return _triplet_loss
+
+
+def pixel_shuffle(
+    input: oneflow._oneflow_internal.BlobDesc,
+    upscale_factor: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator do the pixel shuffle, the shape of input(B, C*r*r, H, W) is arranged to
+    (B, C, H*r, W*r). It can be used to do the sub-pixel convolution.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def PixelShuffleJob(input: tp.Numpy.Placeholder(shape=(3, 4, 2, 2), dtype=flow.float32))->tp.Numpy:
+            out = flow.nn.PixelShuffle(input, upscale_factor=2)
+
+            return out
+
+        input = np.random.uniform(size=(3, 4, 2, 2)).astype(np.float32)
+        out = PixelShuffleJob(input)
+
+        # out.shape (3, 1, 4, 4)
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        upscale_factor (int): The upscale factor.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    return flow.nn.PixelShufflev2(input, upscale_factor, upscale_factor, name=name)
+
+
+def pixel_shufflev2(
+    input: oneflow._oneflow_internal.BlobDesc,
+    h_upscale_factor: int,
+    w_upscale_factor: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator is similar to `oneflow.compatible.single_client.nn.PixelShuffle`. The difference is that in
+    `oneflow.compatible.single_client.nn.PixelShuffle`, the upscale factor of height and width is the same. But in
+    `oneflow.compatible.single_client.nn.PixelShufflev2`, you can set different upscale factor for height and width.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        h_upscale_factor (int): The upscale factor of height.
+        w_upscale_factor (int): The upscale factor of width.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def PixelShufflev2Job(input: tp.Numpy.Placeholder(shape=(3, 16, 2, 4), dtype=flow.float32))->tp.Numpy:
+            out = flow.nn.PixelShufflev2(input, h_upscale_factor=2, w_upscale_factor=4)
+
+            return out
+
+        input = np.random.uniform(size=(3, 16, 2, 4)).astype(np.float32)
+        out = PixelShuffleJob(input)
+
+        # out.shape (3, 2, 4, 16)
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+    """
+    assert (
+        h_upscale_factor > 0 and w_upscale_factor > 0
+    ), "The scale factor of height and width must larger than zero"
+    assert len(input.shape) == 4, "Only Accept 4D Blob"
+    (_batch, _channel, _height, _width) = input.shape
+    assert (
+        _channel % (h_upscale_factor * w_upscale_factor) == 0
+    ), "The channels of input tensor must be divisible by (h_upscale_factor * w_upscale_factor)"
+    if name is None:
+        name = id_util.UniqueStr("PixelShufflev2")
+    _new_c = int(_channel / (h_upscale_factor * w_upscale_factor))
+    out = flow.reshape(
+        input,
+        [_batch, _new_c, h_upscale_factor * w_upscale_factor, _height, _width],
+        name=name + "_reshape1",
+    )
+    out = flow.reshape(
+        out,
+        [_batch, _new_c, h_upscale_factor, w_upscale_factor, _height, _width],
+        name=name + "_reshape2",
+    )
+    out = flow.transpose(out, [0, 1, 4, 2, 5, 3], name=name + "_transpose")
+    out = flow.reshape(
+        out,
+        [_batch, _new_c, _height * h_upscale_factor, _width * w_upscale_factor],
+        name=name + "_reshape3",
+    )
+    return out
+
+
+def kldivloss(
+    input: oneflow._oneflow_internal.BlobDesc,
+    target: oneflow._oneflow_internal.BlobDesc,
+    log_target: bool = False,
+    reduction: str = "mean",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the Kullback-Leiber divergence loss.
+
+    The equation is:
+
+    If :math:`log\\_target = True`:
+
+    .. math::
+
+            loss = e^{target}*(target-input)
+
+    If :math:`log\\_target = False`:
+
+    .. math::
+
+            loss = target*(log(target)-input)
+
+    Attention:
+        In `log_target = False` case, the element in loss will set to be `0` when the element in target is less than `0`
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def of_kldivloss(input: tp.Numpy.Placeholder(shape=(3, 3)),
+                        target: tp.Numpy.Placeholder(shape=(3, 3))) -> tp.Numpy:
+            return flow.nn.KLDivLoss(input, target, log_target=False, reduction='none')
+
+
+        input = np.array([[0.1, 0.2, 0.7],
+                    [0.8, 0.9, 0.5],
+                    [0.5, 0.15, 0.35]]).astype(np.float32)
+        target = np.array([[0.3, 0.1, 0.6],
+                    [-0.3, 0.4, 0.4],
+                    [0.35, 0.25, 0.4]]).astype(np.float32)
+
+        out = of_kldivloss(input, target)
+
+        # output [[-0.39119187 -0.25025854 -0.7264954 ]
+        #         [ 0.         -0.72651625 -0.56651634]
+        #         [-0.54243773 -0.3840736  -0.5065163 ]]
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): The input tensor.
+        target (oneflow._oneflow_internal.BlobDesc): The target tensor.
+        log_target (bool, optional): Whether the `target` is passed in the log space. Defaults to False.
+        reduction (str, optional): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result tensor.
+    """
+    assert reduction in [
+        "none",
+        "mean",
+        "sum",
+    ], "{} is not a valid value for reduction, The reduction must be the one of `none`, `mean`, `sum`. ".format(
+        reduction
+    )
+    if name is None:
+        name = id_util.UniqueStr("KLDivLoss_")
+    if log_target:
+        _kl_div_loss = flow.math.exp(target, name=name + "exp") * (target - input)
+    else:
+        _kl_div_out_loss = target * (flow.math.log(target, name=name + "log") - input)
+        _zeros = flow.zeros_like(
+            _kl_div_out_loss, dtype=_kl_div_out_loss.dtype, name=name + "zeros"
+        )
+        _condition = flow.cast(
+            flow.math.rint(target + 0.5, name=name + "rint"),
+            dtype=flow.int8,
+            name=name + "cast2int",
+        )
+        _kl_div_loss = flow.where(
+            _condition, _kl_div_out_loss, _zeros, name=name + "where"
+        )
+    if reduction == "mean":
+        return flow.math.reduce_mean(_kl_div_loss, name=name + "_reduce_mean")
+    elif reduction == "sum":
+        return flow.math.reduce_sum(_kl_div_loss, name=name + "_reduce_sum")
+    else:
+        return _kl_div_loss
diff --git a/python/oneflow/compatible/single_client/ops/one_hot.py b/python/oneflow/compatible/single_client/ops/one_hot.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f6e6239f63159a82c37cd565c4a9fecbf20515f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/one_hot.py
@@ -0,0 +1,142 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def one_hot(
+    indices: oneflow._oneflow_internal.BlobDesc,
+    depth: int,
+    on_value: Union[int, float] = 1,
+    off_value: Union[int, float] = 0,
+    axis: int = -1,
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator generates a onehot Blob from input Blob.
+
+    If input Blob's rank is `N`, the corresponding onehot Blob's rank is `N+1`. The new axis is generated on the specified dimension according to the parameter `axis`.
+
+    The locations represented by `indices` take value `on_value`, while other locations take `off_value`
+
+    Args:
+        indices (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        depth (int): The length of onehot Blob.
+        on_value (Union[int, float], optional): The fill value when `indices[i] == i`. Defaults to 1.
+        off_value (Union[int, float], optional): The fill value when `indice[i] != i`. Defaults to 0.
+        axis (int, optional): The specified dimension that the new axis is generated on. Defaults to -1.
+        dtype (Optional[flow.dtype], optional): The output data type, it can be "oneflow.compatible.single_client.int32", "oneflow.compatible.single_client.int64", "oneflow.compatible.single_client.float", "oneflow.compatible.single_client.double". Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Note:
+
+        The data type of input blob should be `int32` or `int64`
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def onehot_Job(x: tp.Numpy.Placeholder((4, ), dtype=flow.int32)
+        ) -> tp.Numpy:
+            return flow.one_hot(indices=x,
+                                depth=5,
+                                axis=-1,
+                                dtype=flow.int32)
+
+
+        x = np.array([0, 3, 1, 2]).astype(np.int32)
+        out = onehot_Job(x)
+
+        # out [[1 0 0 0 0]
+        #      [0 0 0 1 0]
+        #      [0 1 0 0 0]
+        #      [0 0 1 0 0]]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def onehot_Job(x: tp.Numpy.Placeholder((4, ), dtype=flow.int32)
+        ) -> tp.Numpy:
+            return flow.one_hot(indices=x,
+                                depth=5,
+                                axis=0,
+                                dtype=flow.int32)
+
+
+        x = np.array([0, 3, 1, 2]).astype(np.int32)
+        out = onehot_Job(x)
+
+        # out [[1 0 0 0]
+        #      [0 0 1 0]
+        #      [0 0 0 1]
+        #      [0 1 0 0]
+        #      [0 0 0 0]]
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: [description]
+    """
+    out_ndims = len(indices.shape) + 1
+    if axis < 0:
+        axis += out_ndims
+    assert axis >= 0 and axis < out_ndims, ValueError(
+        "Expected axis to between [%d, %d).  But received: %d "
+        % (-out_ndims, out_ndims, axis)
+    )
+    out = (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("OneHot_"))
+        .Op("one_hot")
+        .Input("indices", [indices])
+        .Attr("depth", int(depth))
+        .Attr("floating_on_value", float(on_value))
+        .Attr("integer_on_value", int(on_value))
+        .Attr("floating_off_value", float(off_value))
+        .Attr("integer_off_value", int(off_value))
+        .Attr("dtype", dtype)
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+    if axis != out_ndims - 1:
+        dim_list = list(range(0, out_ndims))
+        dim_list.insert(axis, out_ndims - 1)
+        dim_list.pop()
+        return flow.transpose(out, dim_list)
+    else:
+        return out
diff --git a/python/oneflow/compatible/single_client/ops/optimizer.py b/python/oneflow/compatible/single_client/ops/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b5ed1b2399f24e69f3992e299cb7c15de5f1f3a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/optimizer.py
@@ -0,0 +1,1991 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections.abc
+import traceback
+from typing import Callable, List, Optional, Sequence, Text, Union
+
+import oneflow._oneflow_internal
+from oneflow import oneflow_deprecate
+from oneflow._oneflow_internal.oneflow.core.job import job_conf as job_conf_cfg
+from oneflow._oneflow_internal.oneflow.core.job import (
+    learning_rate_schedule_conf as learning_rate_schedule_conf_cfg,
+)
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import runtime_mode as rt_mode
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+
+
+def GetVariablesForCurrentJob() -> List[Text]:
+    sess = session_ctx.GetDefaultSession()
+    assert (
+        rt_mode.CurrentMode() == rt_mode.GLOBAL_MODE
+    ), "Optimizer's Variables() or minimize() method should be called inside a Job Function to implicitly get variables from a job."
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    return list(sess.job_name2var_name2var_blob_[job_name].keys())
+
+
+class ClipGradientConf:
+    @property
+    def clip_conf(self) -> job_conf_cfg.ClipConf:
+        raise NotImplementedError()
+
+
+class by_global_norm(ClipGradientConf):
+    """This operator limits the norm of `Input` with `clip_norm`.
+
+    If the norm of `Input` is less than the `clip_norm`,
+
+    the `Output` will be the same as `Input`.
+
+    If the norm of `Input` is greater than the `clip_norm`, the `Output` will be scaled.
+
+    The equation is:
+
+    .. math::
+
+        Output = \\frac{clip\\_norm*Input}{norm(Input)}
+
+    Args:
+        clip_norm (float): The maximum norm value.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set gradient_clip
+            gradient_clip = flow.optimizer.grad_clipping.by_global_norm(1.0)
+            # Set AdamW optimizer with gradient clip
+            flow.optimizer.AdamW(lr_scheduler,
+                        do_bias_correction=False, weight_decay=0.00005,
+                        grad_clipping=gradient_clip).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(self, clip_norm):
+        self.clip_norm = clip_norm
+
+    @property
+    def clip_conf(self):
+        clip_conf = job_conf_cfg.ClipConf()
+        clip_conf.mutable_clip_by_global_norm().set_clip_norm(self.clip_norm)
+        return clip_conf
+
+
+class WarmupConf:
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        raise NotImplementedError()
+
+
+class constant(WarmupConf):
+    """This operator use the constant warmup strategy to adjust the learning rate.
+
+    Before the steps are specified by user, the learning rate is:
+
+    .. math::
+
+        learning\\_rate = base\\_learning\\_rate*multiplier
+
+    After the steps are specified by user, the learning rate is:
+
+    .. math::
+
+        learning\\_rate = base\\_learning\\_rate
+
+    Args:
+        steps (int): [description]
+        multiplier (float): The scale factor :math:`multiplier`, it should be greater than 0. and less than 1.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Before 10 epochs, the learning rate is 0.001
+            # After 10 epochs, the learning rate is 0.01
+            warmup_scheduler = flow.optimizer.warmup.constant(10, 0.1)
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.01], warmup=warmup_scheduler)
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(self, steps, multiplier):
+        self.steps = steps
+        self.multiplier = multiplier
+
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        warmup_conf = learning_rate_schedule_conf_cfg.WarmupConf()
+        warmup_conf.mutable_constant_conf().set_warmup_batches(self.steps)
+        warmup_conf.mutable_constant_conf().set_multiplier(self.multiplier)
+        return warmup_conf
+
+
+class linear(WarmupConf):
+    """This operator uses the linear warmup strategy to adjust the learning rate.
+
+    When current train step is less than warmup steps, the learning rate will be updated as:
+
+    .. math::
+
+        & current\\_multiplier = start\\_multiplier + (1-start\\_multiplier)*\\frac{train\\_step}{warmup\\_step}
+
+        & current\\_learning\\_rate = learning\\_rate*current\\_multiplier
+
+    Args:
+        steps (int): The warmup steps.
+        start_multiplier (float): The start multiplier(:math:`start\\_multiplier`). It should be greater than 0. and less than 1.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Before 10 epochs, the learning rate will increase from 0.001 to 0.01 in linear.
+            warmup_scheduler = flow.optimizer.warmup.linear(10, 0.1)
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.01], warmup=warmup_scheduler)
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(self, steps, start_multiplier):
+        self.steps = steps
+        self.start_multiplier = start_multiplier
+
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        warmup_conf = learning_rate_schedule_conf_cfg.WarmupConf()
+        warmup_conf.mutable_linear_conf().set_warmup_batches(self.steps)
+        warmup_conf.mutable_linear_conf().set_start_multiplier(self.start_multiplier)
+        return warmup_conf
+
+
+class LrScheduler:
+    def __init__(
+        self,
+        base_lr: Optional[float] = None,
+        lr_lbn: Optional[Text] = None,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        self.base_lr = base_lr
+        self.lr_lbn = lr_lbn
+        self.warmup = warmup
+
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        if self.warmup is None:
+            return None
+        return self.warmup.warmup_conf
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        raise NotImplementedError()
+
+    def SetLrFieldsInOptimizerConf(self, optimizer_conf) -> None:
+        if self.lr_lbn is not None:
+            assert self.base_lr is None
+            assert self.warmup is None
+            assert self.learning_rate_decay_conf is None
+            optimizer_conf.set_learning_rate_lbn(self.lr_lbn)
+        else:
+            assert self.base_lr is not None
+            optimizer_conf.set_base_learning_rate(self.base_lr)
+            if self.warmup_conf is not None:
+                optimizer_conf.mutable_warmup_conf().CopyFrom(self.warmup_conf)
+            if self.learning_rate_decay_conf is not None:
+                optimizer_conf.mutable_learning_rate_decay().CopyFrom(
+                    self.learning_rate_decay_conf
+                )
+
+
+class CosineScheduler(LrScheduler):
+    """This operator creates a Cosine decayed learning rate scheduler.
+
+    Before the steps are specified by user, the learning rate will be updated as:
+
+    .. math::
+
+        & cos\\_decay = 0.5*(1+cos(\\pi*\\frac{current\\_batch}{decayed\\_batch}))
+
+        & decay\\_factor = (1-\\alpha)*cos\\_decay+\\alpha
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_factor
+
+    After the steps specified by user, the learning rate will be :
+
+    .. math::
+
+        learning\\_rate = {base\\_learning\\_rate}*{\\alpha}
+
+    Args:
+        base_lr (float): The base learning rate (:math:`base\\_learning\\_rate`)
+        steps (int): The decay steps in the scheduler (:math:`decayed\\_batch`)
+        alpha (float, optional): The learning rate scale factor (:math:`\\alpha`). Defaults to 0.0.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.CosineScheduler(base_lr=0.01,
+                                                          steps=10,
+                                                          alpha=0.1)
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        alpha: float = 0.0,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.alpha = alpha
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_cosine_conf().set_decay_batches(self.steps)
+        learning_rate_decay_conf.mutable_cosine_conf().set_alpha(self.alpha)
+        return learning_rate_decay_conf
+
+
+class CustomScheduler(LrScheduler):
+    def __init__(self, lbn: Text):
+        super().__init__(lr_lbn=lbn)
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> learning_rate_schedule_conf_cfg.LearningRateDecayConf:
+        return None
+
+
+class PiecewiseConstantScheduler(LrScheduler):
+    """This operator creates a piecewise constant learning rate scheduler.
+
+    The change in learning rate can be described as follows:
+
+    .. code-block:: python
+
+        boundaries = [1000, 2000]
+        values = [0.1, 0.01, 0.001]
+
+        if current_step < 1000:
+            learning_rate = 0.1
+        elif 1000 < current_step < 2000:
+            learning_rate = 0.01
+        else:
+            learning_rate = 0.001
+
+    Args:
+        boundaries (Sequence[int]): A list of train steps.
+        values (Sequence[float]): A list of learning rate values during the different train step boundary.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+                images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler(boundaries=[10, 20],
+                                                                     values=[0.1, 0.01, 0.001])
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        boundaries: Sequence[int],
+        values: Sequence[float],
+        warmup: Optional[WarmupConf] = None,
+    ):
+        assert len(boundaries) + 1 == len(values)
+        super().__init__(base_lr=values[0], warmup=warmup)
+        self.boundaries = boundaries
+        self.values = values
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        for boundary in self.boundaries:
+            learning_rate_decay_conf.mutable_piecewise_constant_conf().add_boundaries(
+                boundary
+            )
+        for value in self.values:
+            learning_rate_decay_conf.mutable_piecewise_constant_conf().add_values(value)
+        return learning_rate_decay_conf
+
+
+class PiecewiseScalingScheduler(LrScheduler):
+    """This operator creates a piecewise scaled decayed learning rate scheduler.
+
+    The change in learning rate can be described as follows:
+
+    .. code-block:: python
+
+        boundaries = [1000, 2000]
+        scale = [0.1, 0.01]
+        base_lr = 0.1
+
+        if current_step < 1000:
+            learning_rate = base_lr
+        elif 1000 < current_step < 2000:
+            learning_rate = 0.1*base_lr
+        else:
+            learning_rate = 0.01*base_lr
+
+    Args:
+        base_lr (float): The base learning rate
+        boundaries (Sequence[int]): A list of train steps.
+        scale (Union[float, Sequence[float]]): A list of learning rate scaled factors during the different train step boundary.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(base_lr=0.1,
+                                                                    boundaries=[5, 10],
+                                                                    scale=[0.5, 0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        boundaries: Sequence[int],
+        scale: Union[float, Sequence[float]],
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.boundaries = boundaries
+        if not isinstance(scale, collections.abc.Sequence):
+            scale = [scale] * len(boundaries)
+        assert len(boundaries) == len(scale)
+        self.scales = [1] + list(scale)
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        for boundary in self.boundaries:
+            learning_rate_decay_conf.mutable_piecewise_scaling_conf().add_boundaries(
+                boundary
+            )
+        for scale in self.scales:
+            learning_rate_decay_conf.mutable_piecewise_scaling_conf().add_scales(scale)
+        return learning_rate_decay_conf
+
+
+class PolynomialScheduler(LrScheduler):
+    """This operator creates a polynomial decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If cycle is `True`, the equation is:
+
+    .. math::
+
+        & decay\\_batch = decay\\_batch*ceil(\\frac{current\\_batch}{decay\\_batch})
+
+        & learning\\_rate = (base\\_lr-end\\_lr)*(1-\\frac{current\\_batch}{decay\\_batch})^{pow}+end\\_lr
+
+    If cycle is `False`, the equation is:
+
+    .. math::
+
+        & decay\\_batch = min(decay\\_batch, current\\_batch)
+
+        & learning\\_rate = (base\\_lr-end\\_lr)*(1-\\frac{current\\_batch}{decay\\_batch})^{pow}+end\\_lr
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decayed steps
+        end_learning_rate (float, optional): The final learning rate. Defaults to 0.0001.
+        power (float, optional): The power of polynomial. Defaults to 1.0.
+        cycle (bool, optional): If cycle is true, the scheduler will decay the learning rate every decay steps. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow.compatible.single_client as flow
+            import oneflow.compatible.single_client.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.PolynomialScheduler(base_lr=0.001,
+                                                                 steps=5,
+                                                                 end_learning_rate=0.00001,
+                                                                 power=2)
+                flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        end_learning_rate: float = 0.0001,
+        power: float = 1.0,
+        cycle: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.end_learning_rate = end_learning_rate
+        self.power = power
+        self.cycle = cycle
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_polynomial_conf().set_decay_batches(self.steps)
+        learning_rate_decay_conf.mutable_polynomial_conf().set_end_learning_rate(
+            self.end_learning_rate
+        )
+        learning_rate_decay_conf.mutable_polynomial_conf().set_power(self.power)
+        learning_rate_decay_conf.mutable_polynomial_conf().set_cycle(self.cycle)
+        return learning_rate_decay_conf
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+class PolynomialSchduler(PolynomialScheduler):
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        end_learning_rate: float = 0.0001,
+        power: float = 1.0,
+        cycle: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        print(
+            "WARNING:",
+            "oneflow.compatible.single_client.optimizer.PolynomialSchduler",
+            "will be removed in the future, use {} instead.".format(
+                "oneflow.compatible.single_client.optimizer.PolynomialScheduler"
+            ),
+        )
+        print(traceback.format_stack()[-2])
+        super().__init__(
+            base_lr=base_lr,
+            steps=steps,
+            end_learning_rate=end_learning_rate,
+            power=power,
+            cycle=cycle,
+            warmup=warmup,
+        )
+
+
+class LinearCosineScheduler(LrScheduler):
+    """This operator creates a linear cosine decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    .. math::
+
+        & current\\_batch = min(current\\_batch, decay\\_batch)
+
+        & linear\\_decay = \\frac{(decay\\_batch - current\\_batch)}{decay\\_batch}
+
+        & cosine\\_decay = 0.5*(1.0+cos(2*\\pi*num\\_periods*\\frac{current\\_batch}{decay\\_batch}))
+
+        & decay\\_factor = (\\alpha+linear\\_decay)*cosine\\_decay + \\beta
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_factor
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        num_periods (float, optional): The number of decay periods. Defaults to 0.5.
+        alpha (float, optional): The :math:`\\alpha` in equation. Defaults to 0.0.
+        beta (float, optional): The :math:`\\beta` in equation. Defaults to 0.001.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow.compatible.single_client as flow
+            import oneflow.compatible.single_client.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.LinearCosineScheduler(base_lr=0.1,
+                                                                    steps=10)
+                flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        num_periods: float = 0.5,
+        alpha: float = 0.0,
+        beta: float = 0.001,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.num_periods = num_periods
+        self.alpha = alpha
+        self.beta = beta
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_decay_batches(
+            self.steps
+        )
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_num_periods(
+            self.num_periods
+        )
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_alpha(self.alpha)
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_beta(self.beta)
+        return learning_rate_decay_conf
+
+
+class ExponentialScheduler(LrScheduler):
+    """This operator creates a exponential decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If staircase is set to False, the equation is:
+
+    .. math::
+
+        & pow = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_rate^{pow}
+
+    If staircase is set to True, the equation is:
+
+    .. math::
+
+        & pow = floor(\\frac{current\\_batch}{decay\\_batch})
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_rate^{pow}
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        decay_rate (float): The decay rate
+        staircase (bool, optional): If staircase is True, the scheduler decay the learning rate at discrete intervals. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block::python
+
+            import oneflow.compatible.single_client as flow
+            import oneflow.compatible.single_client.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.CosineScheduler(base_lr=0.01,
+                                                              steps=10,
+                                                              alpha=0.1)
+                flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        decay_rate: float,
+        staircase=False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.decay_rate = decay_rate
+        self.staircase = staircase
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_exponential_conf().set_decay_batches(
+            self.steps
+        )
+        learning_rate_decay_conf.mutable_exponential_conf().set_decay_rate(
+            self.decay_rate
+        )
+        learning_rate_decay_conf.mutable_exponential_conf().set_staircase(
+            self.staircase
+        )
+        return learning_rate_decay_conf
+
+
+class InverseTimeScheduler(LrScheduler):
+    """This operator creates a inverse time decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If staircase is set to False, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = \\frac{base\\_learning\\_rate}{1+decay\\_rate*step\\_ratio}
+
+    If staircase is set to True, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = \\frac{base\\_learning\\_rate}{1+floor(decay\\_rate*step\\_ratio)}
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        decay_rate (float): The decay rate
+        staircase (bool, optional): If staircase is True, the scheduler decay the learning rate at discrete intervals. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow.compatible.single_client as flow
+            import oneflow.compatible.single_client.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.InverseTimeScheduler(base_lr=0.1,
+                                                                   steps=5,
+                                                                   decay_rate=0.9)
+                flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        decay_rate: float,
+        staircase: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.decay_rate = decay_rate
+        self.staircase = staircase
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_inverse_time_conf().set_decay_batches(
+            self.steps
+        )
+        learning_rate_decay_conf.mutable_inverse_time_conf().set_decay_rate(
+            self.decay_rate
+        )
+        learning_rate_decay_conf.mutable_inverse_time_conf().set_staircase(
+            self.staircase
+        )
+        return learning_rate_decay_conf
+
+
+class NaturalExpScheduler(LrScheduler):
+    """This operator creates a natural exponential decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If staircase is set to False, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = {base\\_learning\\_rate}*e^{-decay\\_rate*step\\_ratio}
+
+    If staircase is set to True, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = {base\\_learning\\_rate}*e^{-decay\\_rate*floor(step\\_ratio)}
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        decay_rate (float): The decay rate
+        staircase (bool, optional): If staircase is True, the scheduler decay the learning rate at discrete intervals. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow.compatible.single_client as flow
+            import oneflow.compatible.single_client.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.NaturalExpScheduler(base_lr=0.1,
+                                                                  steps=10,
+                                                                  decay_rate=0.5)
+                flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        decay_rate: float,
+        staircase: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.decay_rate = decay_rate
+        self.staircase = staircase
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_natural_exp_conf.set_decay_batches(self.steps)
+        learning_rate_decay_conf.mutable_natural_exp_conf.set_decay_rate(
+            self.decay_rate
+        )
+        learning_rate_decay_conf.mutable_natural_exp_conf.set_staircase(self.staircase)
+        return learning_rate_decay_conf
+
+
+class LossScalePolicy:
+    def SetLossScaleFieldsInTrainConf(self, train_conf):
+        raise NotImplementedError()
+
+
+class StaticLossScalePolicy(LossScalePolicy):
+    def __init__(self, loss_scale_factor: float):
+        super().__init__()
+        self.loss_scale_factor = loss_scale_factor
+
+    def SetLossScaleFieldsInTrainConf(self, train_conf):
+        train_conf.loss_scale_factor = self.loss_scale_factor
+
+
+class DynamicLossScalePolicy(LossScalePolicy):
+    def __init__(
+        self, initial_loss_scale=2 ** 30, increment_period=2000, multiplier=2.0
+    ):
+        super().__init__()
+        self.initial_loss_scale = initial_loss_scale
+        self.increment_period = increment_period
+        self.multiplier = multiplier
+
+    def SetLossScaleFieldsInTrainConf(self, train_conf):
+        train_conf.mutable_dynamic_loss_scale_policy().set_initial_loss_scale(
+            self.initial_loss_scale
+        )
+        train_conf.mutable_dynamic_loss_scale_policy().set_increment_period(
+            self.increment_period
+        )
+        train_conf.mutable_dynamic_loss_scale_policy().set_multiplier(self.multiplier)
+
+
+class Optimizer:
+    def __init__(
+        self,
+        loss_scale_factor: Optional[int] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+    ):
+        self.train_step_lbn = train_step_lbn
+        if loss_scale_factor is not None:
+            assert loss_scale_policy is None
+            self.loss_scale_policy = StaticLossScalePolicy(loss_scale_factor)
+        else:
+            self.loss_scale_policy = loss_scale_policy
+        self._variables_list_init = False
+
+    def Variables(self) -> List[Text]:
+        if not self._variables_list_init:
+            if self.variables is None:
+                self.variables = list(GetVariablesForCurrentJob())
+            elif callable(self.variables):
+                self.variables = list(self.variables())
+            else:
+                self.variables = list(self.variables)
+            self._variables_list_init = True
+        return self.variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf: job_conf_cfg.TrainConf) -> None:
+        raise NotImplementedError()
+
+    @property
+    def train_conf(self) -> job_conf_cfg.TrainConf:
+        train_conf = job_conf_cfg.TrainConf()
+        if self.train_step_lbn is not None:
+            train_conf.set_train_step_lbn(self.train_step_lbn)
+        if self.loss_scale_policy is not None:
+            self.loss_scale_policy.SetLossScaleFieldsInTrainConf(train_conf)
+        self._AddOptimizerConfInTrainConf(train_conf)
+        return train_conf
+
+    def minimize(
+        self,
+        loss: Union[
+            Sequence[oneflow._oneflow_internal.BlobDesc],
+            oneflow._oneflow_internal.BlobDesc,
+        ],
+    ) -> None:
+        if not isinstance(loss, collections.abc.Sequence):
+            loss = [loss]
+        c_api_util.CurJobBuildAndInferCtx_SetTrainConf(self.train_conf)
+        for x in loss:
+            flow.losses.add_loss(x)
+
+
+class SGD(Optimizer):
+    """The optimizer of the stochastic gradient descent algorithm.
+
+    This algorithm takes a random sample's gradient as an approximate estimate of the overall gradient in small batch gradient descent.
+
+    When the momentum = 0, the equation of parameters updating is:
+
+    .. math::
+
+        param_{new} = param_{old} - learning\\_rate*grad
+
+    With momentum, the equation of parameters updating is:
+
+    .. math::
+
+        & V_{t} = \\beta*V_{t-1} + learning\\_rate*g_t
+
+        & param_{new} = param_{old} - V_{t}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        momentum (float, optional): Momentum factor (:math:`\\beta`). Defaults to 0.9.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set Learning rate as 0.1
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            # Set Momentum=0.9 SGD optimizer
+            flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+            return loss
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        loss_scale_factor: Optional[float] = None,
+        momentum: float = 0.9,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.momentum = momentum
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        if self.momentum == 0:
+            optimizer_conf.mutable_naive_conf()
+        else:
+            optimizer_conf.mutable_momentum_conf().set_beta(self.momentum)
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class SGDW(Optimizer):
+    """The optimizer of the stochastic-gradient-descent-weight-decay algorithm.
+
+    (More details please refer to `Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`_).
+
+    When the momentum = 0, the equation of parameters updating is:
+
+    .. math::
+
+        param_{new} = param_{old} - learning\\_rate*(grad + \\lambda*param_{old}))
+
+    With momentum, the equation of parameters updating is:
+
+    .. math::
+
+        & V_{t} = \\beta*V_{t-1} - learning\\_rate*g_t
+
+        & param_{new} = param_{old} + V_{t} - learning\\_rate * \\lambda*param_{old}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        momentum (float, optional): Momentum factor (:math:`\\beta`). Defaults to 0.9.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set Learning rate as 0.1
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            # Set Momentum=0.9 SGDW optimizer, weight_decay factor is 0.00005
+            flow.optimizer.SGDW(lr_scheduler, momentum=0.9, weight_decay=0.00005).minimize(loss)
+
+            return loss
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        loss_scale_factor: Optional[float] = None,
+        momentum: float = 0.9,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.momentum = momentum
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        if self.momentum == 0:
+            optimizer_conf.mutable_naive_conf()
+        else:
+            optimizer_conf.mutable_momentum_conf().set_beta(self.momentum)
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class Adam(Optimizer):
+    """The optimizer of the Adam algorithm.
+
+    This algorithm can adjust the learning rate of each parameter dynamically according to the 1st-moment estimates
+
+    and the 2nd-moment estimates of gradient.
+
+    With bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{V_t} = \\frac{V_t}{1-\\beta_1^t}
+
+        & \\hat{S_t} = \\frac{S_t}{1-\\beta_2^t}
+
+        & \\hat{g} = learning\\_rate*\\frac{\\hat{V_t}}{\\sqrt{\\hat{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Without bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    More details please refer to `Adam <https://arxiv.org/abs/1412.6980>`_
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        do_bias_correction (bool, optional): Whether to do the bias correction. Defaults to False.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set Adam optimizer
+            flow.optimizer.Adam(lr_scheduler, do_bias_correction=False).minimize(loss)
+
+            return loss
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-08,
+        do_bias_correction=False,
+        loss_scale_factor: Optional[float] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.do_bias_correction = do_bias_correction
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_adam_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_adam_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_adam_conf().set_epsilon(self.epsilon)
+        optimizer_conf.mutable_adam_conf().set_do_bias_correction(
+            self.do_bias_correction
+        )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class AdamW(Optimizer):
+    """The optimizer of the Adam-weight-decay algorithm.
+
+    If we use L2 regularization,
+
+    it will be invalid due to the adaptive learning rate in Adam optimizer
+
+    (More details please refer to `Adam-weight-decay <https://www.fast.ai/2018/07/02/adam-weight-decay/>`_).
+
+    So we use Adam-weight-decay algorithm to solve this problem.
+
+    With bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{V_t} = \\frac{V_t}{1-\\beta_1^t}
+
+        & \\hat{S_t} = \\frac{S_t}{1-\\beta_2^t}
+
+        & \\hat{g} = learning\\_rate*(\\frac{\\hat{V_t}}{\\sqrt{\\hat{S_t}}+\\epsilon}+\\lambda*param_{old})
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Without bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*(\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}+\\lambda*param_{old})
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        do_bias_correction (bool, optional): Whether to do the bias correction. Defaults to False.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set AdamW optimizer, weight_decay factor is 0.00005
+            flow.optimizer.AdamW(lr_scheduler,
+                    do_bias_correction=False, weight_decay=0.00005).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-08,
+        do_bias_correction=False,
+        loss_scale_factor: Optional[float] = None,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.do_bias_correction = do_bias_correction
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_adam_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_adam_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_adam_conf().set_epsilon(self.epsilon)
+        optimizer_conf.mutable_adam_conf().set_do_bias_correction(
+            self.do_bias_correction
+        )
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class RMSProp(Optimizer):
+    """The optimizer of the RMSProp algorithm.
+
+    This algorithm uses mean squared gradient to adjust the learning rate.
+
+    The equation of parameters updating is:
+
+        if centered:
+
+            .. math::
+
+                & mg_t = mg * \\beta_1 + (1 - \\beta_1) * grad
+
+                & denom_t = S_t - mg_t * mg_t
+
+        else:
+
+            .. math::
+
+                denom_t = S_t
+
+        .. math::
+
+            param_{new} = param_{old} - \\frac{learning\\_rate}{\\sqrt{denom_t+\\epsilon}} \\odot grad
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        decay_rate (float, optional): The decay factor (:math:`\\beta_1`). Defaults to 0.99.
+        epsilon (float, optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        centered (bool, optional): If `True`, gradients are normalized by the estimated
+                                   variance of the gradient; if False, by the uncentered second moment.
+                                   Setting this to `True` may help with training, but is slightly more
+                                   expensive in terms of computation and memory. Defaults to `False`.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set RMSProp optimizer
+            flow.optimizer.RMSProp(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        decay_rate: float = 0.99,
+        epsilon: float = 1e-08,
+        centered: bool = False,
+        loss_scale_factor: Optional[float] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.decay_rate = decay_rate
+        self.epsilon = epsilon
+        self.centered = centered
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_rmsprop_conf().set_decay_rate(self.decay_rate)
+        optimizer_conf.mutable_rmsprop_conf().set_centered(self.centered)
+        optimizer_conf.mutable_rmsprop_conf().set_epsilon(self.epsilon)
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class LARS(Optimizer):
+    """The optimizer of the LARS algorithm.
+
+    The equation of parameters updating is:
+
+    .. math::
+
+        & local\\_learning\\_rate = learning\\_rate*lars\\_coeff*\\frac{\\lVert{parm_{old}\\rVert}}{\\epsilon+\\lVert{grad\\rVert}+weight_decay*\\lVert{parm_{old}\\rVert}}
+
+        & momentum_t = \\beta*momentum_{t-1} + local\\_learning\\_rate*(grad)
+
+        & param_{new} = param_{old} - momentum_t - local_learning_rate * weight_decay * param_{old}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        momentum_beta (float, optional): The momentum factor (:math:`\\beta`). Defaults to 0.9.
+        epsilon (float, optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-9.
+        lars_coefficient (float, optional): The coefficient factor, it defines how much we trust the layer to change its weights (:math:`lars\\_coeff`). Defaults to 0.0001.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+                images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.1
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            # Set LARS optimizer, momentum factor is 0.9
+            flow.optimizer.LARS(lr_scheduler, momentum_beta=0.9).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        momentum_beta: float = 0.9,
+        epsilon: float = 1e-09,
+        lars_coefficient: float = 0.0001,
+        loss_scale_factor: Optional[float] = None,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.momentum_beta = momentum_beta
+        self.epsilon = epsilon
+        self.lars_coefficient = lars_coefficient
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_lars_conf().set_momentum_beta(self.momentum_beta)
+        optimizer_conf.mutable_lars_conf().set_epsilon(self.epsilon)
+        optimizer_conf.mutable_lars_conf().set_lars_coefficient(self.lars_coefficient)
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class LazyAdam(Optimizer):
+    """
+    The optimizer of the LazyAdam algorithm.
+
+    This algorithm can adjust the learning rate of each parameter dynamically according to the 1st-moment estimates and the 2nd-moment estimates of the gradient.
+
+    The difference between Adam optimizer and LazyAdam optimizer is that LazyAdam only updates the element that has gradient in the current batch, it is faster than Adam optimizer.
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set LazyAdam optimizer
+            flow.optimizer.LazyAdam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        epsilon: float = 1e-08,
+        loss_scale_factor: Optional[float] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_lazy_adam_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_lazy_adam_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_lazy_adam_conf().set_epsilon(self.epsilon)
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class LAMB(Optimizer):
+    """
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-6.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        epsilon: float = 1e-06,
+        loss_scale_factor: Optional[float] = None,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_lamb_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_lamb_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_lamb_conf().set_epsilon(self.epsilon)
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class CombinedOptimizer(Optimizer):
+    """
+    Combined optimizer for multi optimizer case.
+
+    Args:
+        optimizers (Sequence[Optimizer]): optimizers to work together
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+
+        Example: see test_multi_optimizer.py
+    """
+
+    def __init__(
+        self,
+        optimizers: Sequence[Optimizer],
+        loss_scale_factor: Optional[float] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        for optimizer in optimizers:
+            assert not isinstance(
+                optimizer, CombinedOptimizer
+            ), "Forbid constructing CombinedOptimizer recursively"
+            assert (
+                optimizer.train_step_lbn is None
+            ), "Only one train step lbn among multi optimizers, please set thisparameter in CombinedOptimizer"
+            assert (
+                optimizer.loss_scale_policy is None
+            ), "Only one loss scale policy among multi optimizers, please set thisparameter in CombinedOptimizer"
+        self.optimizers = optimizers
+
+    def Variables(self) -> List[Text]:
+        if not self._variables_list_init:
+            self.variables = []
+            for optimizer in self.optimizers:
+                self.variables.append(optimizer.Variables())
+            self._variables_list_init = True
+        return self.variables
+
+    def _SanityCheck(self):
+        all_variables = set(GetVariablesForCurrentJob())
+        union_set = set()
+        inter_set = all_variables
+        for optimizer in self.optimizers:
+            s = set(optimizer.Variables())
+            union_set.union(s)
+            inter_set = inter_set.intersection(s)
+        assert union_set.issubset(all_variables)
+        assert (
+            len(inter_set) == 0
+        ), "Do not allow overlap of variables between multi optimizers"
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        self._SanityCheck()
+        for optimizer in self.optimizers:
+            optimizer._AddOptimizerConfInTrainConf(train_conf)
diff --git a/python/oneflow/compatible/single_client/ops/pad.py b/python/oneflow/compatible/single_client/ops/pad.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4932f8232b88a7100cdc050c9df79ff2c12754
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/pad.py
@@ -0,0 +1,511 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def pad(
+    x: oneflow._oneflow_internal.BlobDesc,
+    paddings: Sequence[int],
+    constant_value: Union[int, float] = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator pads the input blob with constant value that user specifies. User can set the amount of padding by setting the parameter `paddings`.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob
+        paddings (Sequence[int]): A list of integers to specify the padding width, its length must equal with the length of `x.shape`.
+        constant_value (Union[int, float], optional): The constant value to pad. Defaults to 0.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        ValueError: The parameter `paddings` must be a tuple or a list.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The Blob after padding.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def pad_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.pad(x,
+                            paddings=((2, 2), (1, 1)),
+                            constant_value=5)
+
+
+        x = np.array([[1, 1, 1],
+                    [1, 1, 1],
+                    [1, 1, 1]]).astype(np.float32)
+        out = pad_Job(x)
+
+        # out [[5. 5. 5. 5. 5.]
+        #      [5. 5. 5. 5. 5.]
+        #      [5. 1. 1. 1. 5.]
+        #      [5. 1. 1. 1. 5.]
+        #      [5. 1. 1. 1. 5.]
+        #      [5. 5. 5. 5. 5.]
+        #      [5. 5. 5. 5. 5.]]
+
+    """
+    padding_before = []
+    padding_after = []
+    if isinstance(paddings, (list, tuple)):
+        assert len(paddings) == len(x.shape), ValueError(
+            "paddings must be the same size of input dims"
+        )
+        for p in paddings:
+            assert isinstance(p, (list, tuple)) and len(p) == 2, ValueError(
+                "the elem of paddings must be a tuple or a list with length of 2"
+            )
+            padding_before.append(p[0])
+            padding_after.append(p[1])
+    else:
+        raise ValueError("paddings must be a tuple or a list.")
+    if x.dtype in [flow.float32, flow.float16, flow.float64]:
+        floating_constant_value = float(constant_value)
+        integral_constant_value = int(0)
+    else:
+        floating_constant_value = float(0)
+        integral_constant_value = int(constant_value)
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Pad_"))
+        .Op("pad")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("padding_before", padding_before)
+        .Attr("padding_after", padding_after)
+        .Attr("floating_constant_value", floating_constant_value)
+        .Attr("integral_constant_value", integral_constant_value)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def pad_grad(
+    x: oneflow._oneflow_internal.BlobDesc,
+    paddings: Sequence[int],
+    constant_value: Union[int, float] = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    padding_before = []
+    padding_after = []
+    if isinstance(paddings, (list, tuple)):
+        assert len(paddings) == len(x.shape), ValueError(
+            "paddings must be the same size of input dims"
+        )
+        for p in paddings:
+            assert isinstance(p, (list, tuple)) and len(p) == 2, ValueError(
+                "the elem of paddings must be a tuple or a list with length of 2"
+            )
+            padding_before.append(p[0])
+            padding_after.append(p[1])
+    else:
+        raise ValueError("paddings must be a tuple or a list.")
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("PadGrad_")
+        )
+        .Op("pad_grad")
+        .Input("dy", [x])
+        .Output("dx")
+        .Attr("padding_before", padding_before)
+        .Attr("padding_after", padding_after)
+        .Attr("floating_constant_value", float(constant_value))
+        .Attr("integral_constant_value", int(constant_value))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def same_padding(
+    x: oneflow._oneflow_internal.BlobDesc,
+    padding: Sequence[int],
+    data_format: str,
+    kernel_size: Sequence[int],
+    strides: Sequence[int],
+    dilation_rate: Sequence[int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator do the padding in "SAME" mode, It can computes the pad width according to the `kernel_size` and `strides` to keep the size of feature map unchanged after convolution or other operations.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input blob.
+        padding (Sequence[int]): The padding mode. It should be "SAME_UPPER" or "SAME_LOWER"
+        data_format ([type]): The data format of input Blob. If the string starts with "NC", it means the data format is `channel first`, else the data format is `channel last`.
+        kernel_size (Sequence[int]): The kernel size of operations. Its type should be tuple or list.
+        strides (Sequence[int]): The strides of operations. Its type should be tuple or list.
+        dilation_rate (Sequence[int]): The dilation rate of operations. Its type should be tuple or list.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The Blob after padding.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def same_pad_Job(x: tp.Numpy.Placeholder((1, 1, 3, 3))
+        ) -> tp.Numpy:
+            return flow.same_padding(x,
+                                    padding="SAME_UPPER",
+                                    data_format="NCHW",
+                                    kernel_size=(3, 3),
+                                    strides=(1, 1),
+                                    dilation_rate=(1, 1))
+
+
+        x = np.ones(shape=(1, 1, 3, 3)).astype(np.float32)
+        out = same_pad_Job(x)
+
+        # out [[[[0. 0. 0. 0. 0.]
+        #        [0. 1. 1. 1. 0.]
+        #        [0. 1. 1. 1. 0.]
+        #        [0. 1. 1. 1. 0.]
+        #        [0. 0. 0. 0. 0.]]]]
+
+    """
+    assert isinstance(padding, str) and (
+        padding.upper() == "SAME_LOWER" or padding.upper() == "SAME_UPPER"
+    ), 'padding must be "SAME_LOWER" or "SAME_UPPER".'
+    channel_pos = "channels_first" if data_format.startswith("NC") else "channels_last"
+    assert isinstance(kernel_size, (list, tuple))
+    assert isinstance(strides, (list, tuple))
+    assert isinstance(dilation_rate, (list, tuple))
+    num_spatial_dims = len(x.shape) - 2
+    assert len(kernel_size) == num_spatial_dims
+    assert len(strides) == num_spatial_dims
+    assert len(dilation_rate) == num_spatial_dims
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SamePadding_")
+        )
+        .Op("same_padding")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("padding", padding.lower())
+        .Attr("data_format", channel_pos)
+        .Attr("kernel_size", kernel_size)
+        .Attr("strides", strides)
+        .Attr("dilation_rate", dilation_rate)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def reflection_pad2d(
+    x: oneflow._oneflow_internal.BlobDesc,
+    padding: Union[int, tuple, list],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Pads the input tensor using the reflection of the input boundary.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): input blob, only support "NCHW" format.
+        padding (Union[int, oneflow._oneflow_internal.BlobDesc]): The size or bundary of padding, if is int uses the same padding in all dimension;
+        if 4-dims tuple, uses (	ext{padding\\_left}padding_left , 	ext{padding\\_right}padding_right , 	ext{padding\\_top}padding_top , 	ext{padding\\_bottom}padding_bottom )
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: [description]
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def pad_Job(x: tp.Numpy.Placeholder((1, 2, 3, 3))
+        ) -> tp.Numpy:
+            return flow.reflection_pad2d(x, padding=[2, 2, 1, 1])
+
+
+        x = np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32)
+        out = pad_Job(x)
+
+        # out [[[[ 5.  4.  3.  4.  5.  4.  3.]
+        #    [ 2.  1.  0.  1.  2.  1.  0.]
+        #    [ 5.  4.  3.  4.  5.  4.  3.]
+        #    [ 8.  7.  6.  7.  8.  7.  6.]
+        #    [ 5.  4.  3.  4.  5.  4.  3.]]
+
+        #   [[ 14. 13. 12. 13. 14. 13. 12.]
+        #    [ 11. 10.  9. 10. 11. 10.  9.]
+        #    [ 14. 13. 12. 13. 14. 13. 12.]
+        #    [ 17. 16. 15. 16. 17. 16. 15.]
+        #    [ 14. 13. 12. 13. 14. 13. 12.]]]]
+
+    """
+    (H, W) = (x.shape[2], x.shape[3])
+    if isinstance(padding, (tuple, list)):
+        assert len(padding) == len(x.shape), ValueError(
+            "padding boundry must be the same size of input dims"
+        )
+        assert (
+            padding[2] < H and padding[3] < H and (padding[0] < W) and (padding[1] < W)
+        ), ValueError(
+            "Padding size should be less than the corresponding input dimension!"
+        )
+        boundry = [padding[0], padding[1], padding[2], padding[3]]
+    elif isinstance(padding, int):
+        assert padding < H and padding < W, ValueError(
+            "Padding size should be less than the corresponding input dimension!"
+        )
+        boundry = [padding, padding, padding, padding]
+    else:
+        raise ValueError("padding must be in or list or tuple!")
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Reflection_Pad2d_")
+        )
+        .Op("reflection_pad2d")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("padding", list(boundry))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def replication_pad2d(
+    x: oneflow._oneflow_internal.BlobDesc,
+    padding: Union[int, tuple, list],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Pads the input tensor using the replication of the input boundary.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): input blob, only support "NCHW" format.
+        padding (Union[int, oneflow._oneflow_internal.BlobDesc]): The size or bundary of padding, if is int uses the same padding in all dimension;
+        if 4-dims tuple, uses (	ext{padding\\_left}padding_left , 	ext{padding\\_right}padding_right , 	ext{padding\\_top}padding_top , 	ext{padding\\_bottom}padding_bottom )
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: [description]
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def pad_Job(x: tp.Numpy.Placeholder((1, 2, 3, 3))
+        ) -> tp.Numpy:
+            return flow.reflection_pad2d(x, padding=[2, 2, 1, 1])
+
+
+        x = np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32)
+        out = pad_Job(x)
+
+        # out [[[[ 0.  0.  0.  1.  2.  2.  2.]
+        #    [ 0.  0.  0.  1.  2.  2.  2.]
+        #    [ 3.  3.  3.  4.  5.  5.  5.]
+        #    [ 6.  6.  6.  7.  8.  8.  8.]
+        #    [ 6.  6.  6.  7.  8.  8.  8.]]
+
+        #   [[ 9.  9.  9.  10.  11.  11.  11.]
+        #    [ 9.  9.  9.  10.  11.  11.  11.]
+        #    [ 12.  12.  12.  13.  14.  14.  14.]
+        #    [ 15.  15.  15.  16.  17.  17.  17.]
+        #    [ 15.  15.  15.  16.  17.  17.  17.]]]]
+
+    """
+    (H, W) = (x.shape[2], x.shape[3])
+    if isinstance(padding, (tuple, list)):
+        assert len(padding) == len(x.shape), ValueError(
+            "padding boundry must be the same size of input dims"
+        )
+        boundry = [padding[0], padding[1], padding[2], padding[3]]
+    elif isinstance(padding, int):
+        boundry = [padding, padding, padding, padding]
+    else:
+        raise ValueError("padding must be in or list or tuple!")
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Replication_Pad2d_")
+        )
+        .Op("replication_pad2d")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("padding", list(boundry))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def constant_pad2d(
+    x: oneflow._oneflow_internal.BlobDesc,
+    padding: Union[int, tuple, list],
+    constant_value: Union[int, float] = 0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Pads the input tensor using an input constant value.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): input blob, only support "NCHW" format.
+        padding (Union[int, oneflow._oneflow_internal.BlobDesc]): The size or bundary of padding, if is int uses the same padding in all dimension;
+        if 4-dims tuple, uses (	ext{padding\\_left}padding_left , 	ext{padding\\_right}padding_right , 	ext{padding\\_top}padding_top , 	ext{padding\\_bottom}padding_bottom )
+        constant_value (Union[int, float]): The constant value used for padding. Defaults to Zero.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: [description]
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def pad_Job(x: tp.Numpy.Placeholder((1, 2, 3, 3), const_value)
+        ) -> tp.Numpy:
+            return flow.constant_pad2d(x, padding=[2, 2, 1, 1], const_value)
+
+
+        x = np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32)
+        const_value = 1.5
+        out = pad_Job(x, const_value)
+
+        # out [[[[ 1.5  1.5  1.5  1.5  1.5  1.5  1.5]
+        #    [ 1.5  1.5  0.  1.  2.  1.5  1.5]
+        #    [ 1.5  1.5  3.  4.  5.  1.5  1.5]
+        #    [ 1.5  1.5  6.  7.  8.  1.5  1.5]
+        #    [ 1.5  1.5  1.5  1.5  1.5  1.5  1.5]]
+
+        #   [[ 1.5  1.5  1.5  1.5  1.5  1.5  1.5.]
+        #    [ 1.5  1.5  9.  10.  11.  1.5  1.5]
+        #    [ 1.5  1.5  12.  13.  14.  1.5  1.5]
+        #    [ 1.5  1.5  15.  16.  17.  1.5  1.5]
+        #    [ 1.5  1.5  1.5  1.5  1.5  1.5  1.5]]]]
+
+    """
+    (H, W) = (x.shape[2], x.shape[3])
+    if isinstance(padding, (tuple, list)):
+        assert len(padding) == len(x.shape), ValueError(
+            "padding boundry must be the same size of input dims"
+        )
+        boundry = [padding[0], padding[1], padding[2], padding[3]]
+    elif isinstance(padding, int):
+        boundry = [padding, padding, padding, padding]
+    else:
+        raise ValueError("padding must be in or list or tuple!")
+    if x.dtype in [flow.float32, flow.float16, flow.float64]:
+        floating_value = float(constant_value)
+        integral_value = int(0)
+    else:
+        floating_value = float(0)
+        integral_value = int(constant_value)
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Constant_Pad2d_")
+        )
+        .Op("constant_pad2d")
+        .Input("x", [x])
+        .Output("y")
+        .Attr("padding", list(boundry))
+        .Attr("floating_value", floating_value)
+        .Attr("integral_value", integral_value)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def zero_pad2d(
+    x: oneflow._oneflow_internal.BlobDesc,
+    padding: Union[int, tuple, list],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Pads the input tensor using zeros.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): input blob, only support "NCHW" format.
+        padding (Union[int, oneflow._oneflow_internal.BlobDesc]): The size or bundary of padding, if is int uses the same padding in all dimension;
+        if 4-dims tuple, uses (	ext{padding\\_left}padding_left , 	ext{padding\\_right}padding_right , 	ext{padding\\_top}padding_top , 	ext{padding\\_bottom}padding_bottom )
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: [description]
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def pad_Job(x: tp.Numpy.Placeholder((1, 2, 3, 3), const_value)
+        ) -> tp.Numpy:
+            return flow.constant_pad2d(x, padding=[2, 2, 1, 1], const_value)
+
+
+        x = np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32)
+        const_value = 1.5
+        out = pad_Job(x, const_value)
+
+        # out [[[[ 0.  0.  0.  0.  0.  0.  0.]
+        #    [ 0.  0.  0.  1.  2.  0.  0.]
+        #    [ 0.  0.  3.  4.  5.  0.  0.]
+        #    [ 0.  0.  6.  7.  8.  0.  0.]
+        #    [ 0.  0.  0.  0.  0.  0.  0.]]
+
+        #   [[ 0.  0.  0.  0.  0.  0.  0.]
+        #    [ 0.  0.  9.  10.  11.  0.  0.]
+        #    [ 0.  0.  12.  13.  14.  0.  0.]
+        #    [ 0.  0.  15.  16.  17.  0.  0.]
+        #    [ 0.  0.  0.  0.  0.  0.  0.]]]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Zero_Pad2d_")
+    return constant_pad2d(x, padding, 0.0, name)
diff --git a/python/oneflow/compatible/single_client/ops/partial_fc_sample.py b/python/oneflow/compatible/single_client/ops/partial_fc_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a270aee599d3577d769f87041ba7f02dbda77f6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/partial_fc_sample.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def distributed_partial_fc_sample(
+    weight: oneflow._oneflow_internal.BlobDesc,
+    label: oneflow._oneflow_internal.BlobDesc,
+    num_sample: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    parallel_num = flow.current_scope().device_parallel_desc_symbol.parallel_num
+    assert num_sample % parallel_num == 0
+    assert weight.shape[0] % parallel_num == 0
+    return (
+        flow.user_op_builder(
+            name
+            if name is not None
+            else id_util.UniqueStr("DistributedPartialFcSample_")
+        )
+        .Op("distributed_partial_fc_sample")
+        .Input("weight", [weight])
+        .Input("label", [label])
+        .Attr("num_sample", num_sample)
+        .Output("mapped_label")
+        .Output("sampled_label")
+        .Output("sampled_weight")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
diff --git a/python/oneflow/compatible/single_client/ops/prelu.py b/python/oneflow/compatible/single_client/ops/prelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d376e01f7fef4176f7519806118a8c91c75f3ac
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/prelu.py
@@ -0,0 +1,155 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.job import regularizer_conf_pb2 as regularizer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def prelu(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    alpha_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    alpha_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    shared_axes: Optional[Sequence[int]] = None,
+    trainable: bool = True,
+    name: str = "PRelu",
+    model_distribute: oneflow._oneflow_internal.distribute.Distribute = oneflow._oneflow_internal.distribute.broadcast(),
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The Prelu(Parametric Rectified Linear Unit) activation.
+
+    The :math:`\\alpha` is a parameter that can be trained in network
+
+    The equation is
+
+    .. math::
+
+        out = max(0, x) + \\alpha*min(0, x)
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        alpha_initializer (Optional[initializer_conf_util.InitializerConf], optional): The initializer of alpha. Defaults to None.
+        alpha_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): The regularizer of alpha. Defaults to None.
+        shared_axes (Optional[Sequence[int]], optional): The axis along which to share learnable parameters for the prelu activation function. Defaults to None.
+        trainable (bool, optional): Whether to train the parameter :math:`\\alpha`. Defaults to True.
+        name (str, optional): The name for the operation. Defaults to "PRelu".
+        model_distribute (oneflow._oneflow_internal.distribute.Distribute, optional): Define the way to ditribute the model. Defaults to oneflow._oneflow_internal.distribute.broadcast().
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activated Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+        BATCH_SIZE = 100
+
+
+        def lenet(data, train=False):
+            initializer = flow.truncated_normal(0.1)
+            conv1 = flow.layers.conv2d(
+                data,
+                32,
+                5,
+                padding="SAME",
+                name="conv1",
+                kernel_initializer=initializer,
+            )
+            prelu1 = flow.layers.prelu(conv1,
+                                    alpha_initializer=initializer,
+                                    shared_axes=[2, 3],
+                                    name="Prelu1")
+            pool1 = flow.nn.max_pool2d(
+                prelu1, ksize=2, strides=2, padding="SAME", name="pool1", data_format="NCHW"
+            )
+            conv2 = flow.layers.conv2d(
+                pool1,
+                64,
+                5,
+                padding="SAME",
+                name="conv2",
+                kernel_initializer=initializer,
+            )
+            prelu2 = flow.layers.prelu(conv2,
+                                    alpha_initializer=initializer,
+                                    shared_axes=[2, 3],
+                                    name="Prelu2")
+            pool2 = flow.nn.max_pool2d(
+                prelu2, ksize=2, strides=2, padding="SAME", name="pool2", data_format="NCHW"
+            )
+            reshape = flow.reshape(pool2, [pool2.shape[0], -1])
+            hidden = flow.layers.dense(
+                reshape,
+                512,
+                activation=flow.nn.relu,
+                kernel_initializer=initializer,
+                name="dense1",
+            )
+            if train:
+                hidden = flow.nn.dropout(hidden, rate=0.5, name="dropout")
+            return flow.layers.dense(hidden, 10, kernel_initializer=initializer, name="dense2")
+
+
+        @flow.global_function(type="train")
+        def train_job(
+                images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+            return loss
+
+    """
+    alpha_shape = list(inputs.shape[1:])
+    if shared_axes is not None:
+        for i in shared_axes:
+            assert i >= 1 and i < len(inputs.shape)
+            alpha_shape[i - 1] = 1
+    if alpha_initializer is None:
+        alpha_initializer = flow.constant_initializer(0)
+    with flow.scope.namespace(name):
+        alpha = flow.get_variable(
+            name="alpha",
+            shape=alpha_shape,
+            dtype=inputs.dtype,
+            initializer=alpha_initializer,
+            regularizer=alpha_regularizer,
+            trainable=trainable,
+            distribute=model_distribute,
+            reuse=False,
+        )
+    op = (
+        flow.user_op_builder(name)
+        .Op("prelu")
+        .Input("x", [inputs])
+        .Input("alpha", [alpha])
+        .Output("y")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
diff --git a/python/oneflow/compatible/single_client/ops/quantize_ops.py b/python/oneflow/compatible/single_client/ops/quantize_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b114aabd12db1e28fa3d6ecb1ea52e0ff09ae2b1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/quantize_ops.py
@@ -0,0 +1,353 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def min_max_observer(
+    input: oneflow._oneflow_internal.BlobDesc,
+    quantization_bit: int = 8,
+    quantization_scheme: str = "symmetric",
+    quantization_formula: str = "google",
+    per_layer_quantization: bool = True,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """Compute the quantization parameters of the input tensor.
+
+    First compute the max and min values of input tensor:
+
+    .. math::
+
+        & max\\_value = max(input)
+
+        & min\\_value = min(input)
+
+    Then compute the scale and zero_point with the following equations:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit - 1} - 1
+
+            & scale = max(|max\\_value|,|min\\_value|) / denom
+
+            & zero\\_point = 0
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit} - 1
+
+            & scale = (max\\_value - min\\_value) / denom
+
+            & zero\\_point = -min\\_value / scale
+
+    If per_layer_quantization is False, then the shape of scale and zero_point will be (input.shape[0],).
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): input tensor.
+        quantization_bit (int): Quantize input to uintX / intX, X can be in range [2, 8]. Defaults to 8.
+        quantization_scheme (str): "symmetric" or "affine", quantize to signed / unsigned integer. Defaults to "symmetric".
+        quantization_formula (str): Support "google" or "cambricon".
+        per_layer_quantization (bool): True or False, means per-layer / per-channel quantization. Defaults to True.
+        name (Optional[str]): This operator's name. Defaults to None.
+
+    Returns:
+        Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]: The scale and zero_point of input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="predict", function_config=flow.FunctionConfig())
+        def QuantizeJob(
+            input: tp.Numpy.Placeholder(input_shape, dtype=type_name_to_flow_type[dtype])
+        ): tp.Numpy
+            with flow.scope.placement(device_type, "0:0"):
+                scale, zero_point = flow.quantization.min_max_observer(
+                    input, quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google",
+                    per_layer_quantization=True
+                )
+            return scale, zero_point
+
+        input = (np.random.random(input_shape) - 0.5).astype(type_name_to_np_type[dtype])
+        scale, zero_point = QuantizeJob(input)
+
+    """
+    if quantization_formula == "cambricon" and (not per_layer_quantization):
+        raise NotImplementedError(
+            "per-channel mode is not supported in cambricon scheme"
+        )
+    (scale, zero_point) = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("MinMaxObserver_")
+        )
+        .Op("min_max_observer")
+        .Input("in", [input])
+        .Output("scale")
+        .Output("zero_point")
+        .Attr("quantization_bit", quantization_bit)
+        .Attr("quantization_scheme", quantization_scheme)
+        .Attr("quantization_formula", quantization_formula)
+        .Attr("per_layer_quantization", per_layer_quantization)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (scale, zero_point)
+
+
+def moving_average_min_max_observer(
+    input: oneflow._oneflow_internal.BlobDesc,
+    quantization_bit: int = 8,
+    quantization_scheme: str = "symmetric",
+    quantization_formula: str = "google",
+    momentum: float = 0.95,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """Compute the quantization parameters based on the moving average of the input tensor's min and max values.
+
+    First compute the moving\\_max and moving\\_min value of input tensor:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & moving\\_max = moving\\_max * momentum + |max(input)| * (1 - momentum)
+
+            & moving\\_min = moving\\_max
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & moving\\_max = moving\\_max * momentum + max(input) * (1 - momentum)
+
+            & moving\\_min = moving\\_min * momentum + min(input) * (1 - momentum)
+
+    The moving average of min and max values are initialized as the first batch of input `Blob`'s min and max.
+
+    Then compute the scale and zero_point with the following equations:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit - 1} - 1
+
+            & scale = moving\\_max / denom
+
+            & zero\\_point = 0
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit} - 1
+
+            & scale = (moving\\_max - moving\\_min) / denom
+
+            & zero\\_point = -moving\\_min / scale
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): input tensor.
+        quantization_bit (int): Quantize input to uintX / intX, X can be in range [2, 8]. Defaults to 8.
+        quantization_scheme (str): "symmetric" or "affine", quantize to signed / unsigned integer. Defaults to "symmetric".
+        quantization_formula (str): Support "google" or "cambricon".
+        momentum (float): Smoothing parameter for exponential moving average operation. Defaults to 0.95.
+        name (Optional[str]): This operator's name. Defaults to None.
+
+    Returns:
+        Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]: The scale and zero_point of input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="predict", function_config=flow.FunctionConfig())
+        def QuantizeJob(
+            input: tp.Numpy.Placeholder(input_shape, dtype=type_name_to_flow_type[dtype])
+        ): tp.Numpy
+            with flow.scope.placement(device_type, "0:0"):
+                scale, zero_point = flow.quantization.moving_average_min_max_observer(
+                    input, quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google",
+                    momentum=0.95
+                )
+            return scale, zero_point
+
+        input = (np.random.random(input_shape) - 0.5).astype(type_name_to_np_type[dtype])
+        scale, zero_point = QuantizeJob(input)
+
+    """
+    op_name = (
+        name if name is not None else id_util.UniqueStr("MovingAverageMinMaxObserver_")
+    )
+    training = True if flow.current_global_function_desc().IsTrainable() else False
+    with flow.scope.namespace(op_name):
+        moving_max = flow.get_variable(
+            "moving_max",
+            shape=(1,),
+            dtype=input.dtype,
+            initializer=flow.zeros_initializer(input.dtype),
+            trainable=False,
+        )
+        moving_min = flow.get_variable(
+            "moving_min",
+            shape=(1,),
+            dtype=input.dtype,
+            initializer=flow.zeros_initializer(input.dtype),
+            trainable=False,
+        )
+        current_train_step = flow.get_variable(
+            "current_train_step",
+            shape=(1,),
+            dtype=flow.int64,
+            initializer=flow.zeros_initializer(flow.int64),
+            trainable=False,
+        )
+    stop_update_after_iters = 1
+    (scale, zero_point) = (
+        flow.user_op_builder(op_name)
+        .Op("moving_average_min_max_observer")
+        .Input("in", [input])
+        .Input("current_train_step", [current_train_step])
+        .Input("moving_max", [moving_max])
+        .Input("moving_min", [moving_min])
+        .Output("scale")
+        .Output("zero_point")
+        .Attr("training", training)
+        .Attr("stop_update_after_iters", stop_update_after_iters)
+        .Attr("quantization_bit", quantization_bit)
+        .Attr("quantization_scheme", quantization_scheme)
+        .Attr("quantization_formula", quantization_formula)
+        .Attr("momentum", momentum)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (scale, zero_point)
+
+
+def fake_quantization(
+    input: oneflow._oneflow_internal.BlobDesc,
+    scale: oneflow._oneflow_internal.BlobDesc,
+    zero_point: oneflow._oneflow_internal.BlobDesc,
+    quantization_bit: int = 8,
+    quantization_scheme: str = "symmetric",
+    quantization_formula: str = "google",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Simulate the quantize and dequantize operations in training time.
+
+    The output will be computed as:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & quant\\_max = 2^{quantization\\_to\\_bit - 1} - 1
+
+            & quant\\_min = -quant\\_max
+
+            & clamp(round(x / scale), quant\\_min, quant\\_max) * scale
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & quant\\_max = 2^{quantization\\_to\\_bit} - 1
+
+            & quant\\_min = 0
+
+            & (clamp(round(x / scale + zero\\_point), quant\\_min, quant\\_max) - zero\\_point) * scale
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): input tensor.
+        scale (oneflow._oneflow_internal.BlobDesc): Computed by min_max_observer or moving_average_min_max_observer op.
+        zero_point (oneflow._oneflow_internal.BlobDesc): Computed by min_max_observer or moving_average_min_max_observer op.
+        quantization_bit (int): Quantize input to uintX / intX, X can be in range [2, 8]. Defaults to 8.
+        quantization_scheme (str): "symmetric" or "affine", quantize to signed / unsigned integer. Defaults to "symmetric".
+        quantization_formula (str): Support "google" or "cambricon".
+        name (Optional[str]): This operator's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: Input tensor after quantize and dequantize operations.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+        @flow.global_function(type="predict", function_config=flow.FunctionConfig())
+        def QuantizeJob(
+            input: tp.Numpy.Placeholder(input_shape, dtype=type_name_to_flow_type[dtype])
+        ): tp.Numpy
+            with flow.scope.placement(device_type, "0:0"):
+                scale, zero_point = flow.quantization.min_max_observer(
+                    input, quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google",
+                    per_layer_quantization=True
+                )
+                fake_quantize_out = flow.quantization.fake_quantization(
+                    input, scale, zero_point,
+                    quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google"
+                )
+            return fake_quantize_out
+
+        input = (np.random.random(input_shape) - 0.5).astype(type_name_to_np_type[dtype])
+        fake_quantize_out = QuantizeJob(input)
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Fake_Quantization_")
+        )
+        .Op("fake_quantization")
+        .Input("in", [input])
+        .Input("scale", [scale])
+        .Input("zero_point", [zero_point])
+        .Output("out")
+        .Attr("quantization_bit", quantization_bit)
+        .Attr("quantization_scheme", quantization_scheme)
+        .Attr("quantization_formula", quantization_formula)
+        .Build()
+        .InferAndTryRun()
+        .SoleOutputBlob()
+    )
diff --git a/python/oneflow/compatible/single_client/ops/random_ops.py b/python/oneflow/compatible/single_client/ops/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..799c6c25d0f1f2a444861d1dc0f78986d8273d4d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/random_ops.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import module as module_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def Bernoulli(
+    x: oneflow._oneflow_internal.BlobDesc,
+    seed: Optional[int] = None,
+    dtype: Optional[flow.dtype] = None,
+    name: str = "Bernoulli",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns a Blob with binaray random numbers (0 / 1) from a Bernoulli distribution.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        seed (Optional[int], optional): The random seed. Defaults to None.
+        dtype (Optional[flow.dtype], optional): The data type. Defaults to None.
+        name (str, optional): The name for the operation. Defaults to "Bernoulli".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def bernoulli_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+        ) -> tp.Numpy:
+            out = flow.random.bernoulli(x)
+            return out
+
+
+        x = np.array([[0.25, 0.45, 0.3],
+                    [0.55, 0.32, 0.13],
+                    [0.75, 0.15, 0.1]]).astype(np.float32)
+        out = bernoulli_Job(x)
+
+        # Because our random seed is not fixed, so the return value is different each time.
+        # out [[1. 0. 0.]
+        #      [0. 0. 1.]
+        #      [0. 0. 0.]]
+
+    """
+    assert isinstance(name, str)
+    if dtype is None:
+        dtype = x.dtype
+    if seed is not None:
+        assert name is not None
+    module = flow.find_or_create_module(
+        name, lambda: BernoulliModule(dtype=dtype, random_seed=seed, name=name)
+    )
+    return module(x)
+
+
+class BernoulliModule(module_util.Module):
+    def __init__(self, dtype: flow.dtype, random_seed: Optional[int], name: str):
+        module_util.Module.__init__(self, name)
+        (seed, has_seed) = flow.random.gen_seed(random_seed)
+        self.op_module_builder = (
+            flow.user_op_module_builder("bernoulli")
+            .InputSize("in", 1)
+            .Output("out")
+            .Attr("dtype", dtype)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .CheckAndComplete()
+        )
+        self.op_module_builder.user_op_module.InitOpKernel()
+
+    def forward(self, x: oneflow._oneflow_internal.BlobDesc):
+        if self.call_seq_no == 0:
+            name = self.module_name
+        else:
+            name = id_util.UniqueStr("Bernoulli_")
+        return (
+            self.op_module_builder.OpName(name)
+            .Input("in", [x])
+            .Build()
+            .InferAndTryRun()
+            .SoleOutputBlob()
+        )
diff --git a/python/oneflow/compatible/single_client/ops/random_util.py b/python/oneflow/compatible/single_client/ops/random_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6b04ad2fba5bac5ef6a734e4b47910af5e6adec
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/random_util.py
@@ -0,0 +1,43 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+import typing
+
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+def api_gen_random_seed(seed: typing.Optional[int] = None):
+    api = enable_if.unique([consistent_gen_random_seed, mirrored_gen_random_seed])
+    return api(seed)
+
+
+@enable_if.condition(hob.consistent_view_enabled)
+def consistent_gen_random_seed(seed=None):
+    if seed is None:
+        seed = random.randint(-sys.maxsize, sys.maxsize)
+    return (seed, True)
+
+
+@enable_if.condition(hob.mirrored_view_enabled)
+def mirrored_gen_random_seed(seed=None):
+    if seed is None:
+        seed = -1
+        has_seed = False
+    else:
+        has_seed = True
+    return (seed, has_seed)
diff --git a/python/oneflow/compatible/single_client/ops/reduce_mean.py b/python/oneflow/compatible/single_client/ops/reduce_mean.py
new file mode 100644
index 0000000000000000000000000000000000000000..00cf3f267760b8aee951c35e7ef80ad257c2bb59
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/reduce_mean.py
@@ -0,0 +1,87 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def reduce_mean(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[collections.Sized, int]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the mean of input Blob along the specified axis
+
+    Args:
+        input_blob (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[collections.Sized, int]], optional): The dimension along which the mean value is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of average on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_mean_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_mean(x, axis=1, keepdims=True)
+
+
+        x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.float32)
+        out = reduce_mean_Job(x)
+
+        # out [[2.]
+        #      [5.]
+        #      [8.]]
+
+    """
+    reduce_sum = flow.math.reduce_sum(
+        input_blob, axis=axis, keepdims=keepdims, name=name
+    )
+    if input_blob.is_dynamic:
+        reduce_count = flow.math.reduced_shape_elem_cnt(
+            input_blob, axis=axis, dtype=input_blob.dtype
+        )
+        return reduce_sum / reduce_count
+    else:
+        if axis is None:
+            axes = []
+        else:
+            axes = list(axis) if isinstance(axis, collections.Sized) else [axis]
+        reduce_count = 1
+        if len(axes) == 0:
+            for dim in input_blob.shape:
+                reduce_count *= dim
+        else:
+            for i in axes:
+                reduce_count *= input_blob.shape[i]
+        return flow.math.multiply(reduce_sum, 1.0 / reduce_count)
diff --git a/python/oneflow/compatible/single_client/ops/reduce_ops.py b/python/oneflow/compatible/single_client/ops/reduce_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5277b3e5721c91fc8ced53be8415c2b71cbd75e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/reduce_ops.py
@@ -0,0 +1,606 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Sized, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+
+def _gen_unique_name_if_need(name, default_name):
+    if name is None:
+        return id_util.UniqueStr(default_name)
+    assert isinstance(name, str), name
+    return name
+
+
+def _check_axis(axis, shape):
+    if axis is None:
+        axis = list(range(len(shape)))
+    if isinstance(axis, int):
+        axis = [axis]
+    assert isinstance(axis, (list, tuple)), "Invalid axis {}".format(axis)
+    for x in axis:
+        if x < 0:
+            x += len(shape)
+        assert x >= 0 and x < len(shape), "Invalid axis {}, len(shape): {}".format(
+            axis, len(shape)
+        )
+    return axis
+
+
+def _do_reduce(x, name, op_type_name, keepdims, axis):
+    op = (
+        flow.user_op_builder(name)
+        .Op(op_type_name)
+        .Input("input_tensor", [x])
+        .Output("output_tensor")
+        .Attr("axis", axis)
+        .Attr("keepdims", keepdims)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def reduce_sum(
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the sum of elements across dimensions of a tensor
+
+    Args:
+        input_tensor (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the sum value is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of sum on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_sum_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_sum(x, axis=1, keepdims=True)
+
+
+        x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.float32)
+        out = reduce_sum_Job(x)
+
+        # out [[ 6.]
+        #      [15.]
+        #      [24.]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceSum_")
+    axis = _check_axis(axis, input_tensor.shape)
+    if len(axis) == 0:
+        return input_tensor
+    op = (
+        flow.user_op_builder(name)
+        .Op("reduce_sum")
+        .Input("input_tensor", [input_tensor])
+        .Output("output_tensor")
+        .Attr("axis", axis)
+        .Attr("keepdims", keepdims)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def reduce_any(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the `logical or` of input Blob along the specified axis
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the logical and value is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of logical or on the specified axis of input Blob
+
+    Note:
+
+        The input Blob dtype is int8
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_any_Job(x: tp.Numpy.Placeholder((3, 3), dtype=flow.int8)
+        ) -> tp.Numpy:
+            return flow.math.reduce_any(x, axis=1, keepdims=True)
+
+
+        x = np.array([[1, 0, 0], [0, 0, 0], [1, 0, 1]]).astype(np.int8)
+        out = reduce_any_Job(x)
+
+        # out [[1]
+        #      [0]
+        #      [1]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceAny_")
+    axis = _check_axis(axis, x.shape)
+    if len(axis) == 0:
+        return flow.math.not_equal(x, flow.constant_scalar(value=0.0, dtype=x.dtype))
+    return _do_reduce(x, name, "reduce_any", keepdims, axis)
+
+
+def reduce_min(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the minimum value of input Blob along the specified axis
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the minimum value is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of minimum value on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_min_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_min(x, axis=1, keepdims=True)
+
+
+        x = np.array([[2, 1, 3], [5, 3, 6], [7, 4, 9]]).astype(np.float32)
+        out = reduce_min_Job(x)
+
+        # out [[1.]
+        #      [3.]
+        #      [4.]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceMin_")
+    axis = _check_axis(axis, x.shape)
+    if len(axis) == 0:
+        return x
+    return _do_reduce(x, name, "reduce_min", keepdims, axis)
+
+
+def reduce_max(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the maximum value of input Blob along the specified axis
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the maximum value is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of maximum value on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_max_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_max(x, axis=1, keepdims=True)
+
+
+        x = np.array([[2, 1, 4], [5, 3, 7], [7, 4, 9]]).astype(np.float32)
+        out = reduce_max_Job(x)
+
+        # out [[4.]
+        #      [7.]
+        #      [9.]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceMax_")
+    axis = _check_axis(axis, x.shape)
+    if len(axis) == 0:
+        return x
+    return _do_reduce(x, name, "reduce_max", keepdims, axis)
+
+
+def reduce_prod(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the product of input Blob along the specified axis
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the product is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of product value on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_product_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_prod(x, axis=1, keepdims=True)
+
+
+        x = np.array([[1, 2, 3], [3, 4, 5], [6, 3, 2]]).astype(np.float32)
+        out = reduce_product_Job(x)
+
+        # out [[ 6.]
+        #      [60.]
+        #      [36.]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceProd_")
+    axis = _check_axis(axis, x.shape)
+    if len(axis) == 0:
+        return x
+    return _do_reduce(x, name, "reduce_prod", keepdims, axis)
+
+
+def reduce_all(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the `logical and` of input Blob along the specified axis
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the logical and value is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of logical and value on the specified axis of input Blob
+
+    Note:
+
+        The input Blob dtype is int8
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_all_Job(x: tp.Numpy.Placeholder((3, 3), dtype=flow.int8)
+        ) -> tp.Numpy:
+            return flow.math.reduce_all(x, axis=1, keepdims=True)
+
+
+        x = np.array([[1, 0, 0], [0, 0, 0], [1, 1, 1]]).astype(np.int8)
+        out = reduce_all_Job(x)
+
+        # out [[0]
+        #      [0]
+        #      [1]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceAll_")
+    axis = _check_axis(axis, x.shape)
+    if len(axis) == 0:
+        return flow.math.not_equal(x, flow.constant_scalar(value=0.0, dtype=x.dtype))
+    return _do_reduce(x, name, "reduce_all", keepdims, axis)
+
+
+def reduce_euclidean_norm(
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the Euclidean norm of input Blob along the specified axis
+
+    The equation is:
+
+    .. math::
+
+        out=\\sqrt{\\sum_{t=0}^{n} x_{t}^2}
+
+    Args:
+        input_tensor (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the Euclidean norm is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of Euclidean norm on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_euclidean_norm_Job(x: tp.Numpy.Placeholder((3, 2))
+        ) -> tp.Numpy:
+            return flow.math.reduce_euclidean_norm(x, axis=1, keepdims=True)
+
+
+        x = np.array([[3, 4], [5, 12], [8, 15]]).astype(np.float32)
+        out = reduce_euclidean_norm_Job(x)
+
+        # out [[ 5.]
+        #      [13.]
+        #      [17.]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceEuclideanNorm_")
+    return flow.math.sqrt(
+        flow.math.reduce_sum(
+            flow.math.square(input_tensor, name + "_square"),
+            axis,
+            keepdims,
+            name + "_reduce_sum",
+        ),
+        name + "_sqrt",
+    )
+
+
+def reduce_logsumexp(
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the log of exponential sum of input Blob along the specified axis
+
+
+    The equation is:
+
+    .. math::
+
+        out = log(\\sum_{t=0}^{t=n} e^{x_{t}})
+
+    Args:
+        input_tensor (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the log of exponential sum is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of log of exponential sum on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_logsumexp_Job(x: tp.Numpy.Placeholder((3, 2))
+        ) -> tp.Numpy:
+            return flow.math.reduce_logsumexp(x, axis=1, keepdims=True)
+
+
+        x = np.array([[0, 0], [1, 1], [2, 2]]).astype(np.float32)
+        out = reduce_logsumexp_Job(x)
+
+        # out [[0.6931472]
+        #      [1.6931472]
+        #      [2.6931472]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceLogSumExp_")
+    axis = _check_axis(axis, input_tensor.shape)
+    return flow.math.log(
+        flow.math.reduce_sum(
+            flow.math.exp(input_tensor, name + "_exp"),
+            axis,
+            keepdims,
+            name + "_reduce_sum",
+        ),
+        name + "_log",
+    )
+
+
+def reduce_std(
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the standard deviation of input Blob along the specified axis
+
+    The equation is:
+
+    .. math::
+
+        out=\\sqrt{\\frac{1}{n}*\\sum_{i=1}^{n}(x_i-mean)^2}
+
+    Args:
+        input_tensor (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the standard deviation is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of standard deviation on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_std_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_std(x, axis=1, keepdims=True)
+
+
+        x = np.array([[0, 5, 10], [5, 5, 5], [12, 3, 0]]).astype(np.float32)
+        out = reduce_std_Job(x)
+
+        # out [[4.0824833]
+        #      [0.       ]
+        #      [5.0990195]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceStd_")
+    axis = _check_axis(axis, input_tensor.shape)
+    if isinstance(axis, list) and len(axis) == 0:
+        return flow.zeros_like(
+            input_tensor, dtype=input_tensor.dtype, name=name + "_zeros_like"
+        )
+    return flow.math.sqrt(
+        flow.math.reduce_variance(
+            input_tensor, axis, keepdims, name + "_reduce_variance"
+        ),
+        name + "_sqrt",
+    )
+
+
+def reduce_variance(
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the variance of input Blob along the specified axis
+
+    The equation is:
+
+    .. math::
+
+        out=\\frac{1}{n}*\\sum_{i=1}^{n}(x_i-mean)^2
+
+    Args:
+        input_tensor (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the variance is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of variance on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def reduce_variance_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_variance(x, axis=1, keepdims=True)
+
+
+        x = np.array([[0, 5, 10], [5, 5, 5], [12, 3, 0]]).astype(np.float32)
+        out = reduce_variance_Job(x)
+
+        # out [[16.666668]
+        #      [ 0.      ]
+        #      [26.      ]]
+
+    """
+    name = _gen_unique_name_if_need(name, "ReduceVariance_")
+    axis = _check_axis(axis, input_tensor.shape)
+    if isinstance(axis, list) and len(axis) == 0:
+        return flow.zeros_like(
+            input_tensor, dtype=input_tensor.dtype, name=name + "_zeros_like"
+        )
+    return flow.math.subtract(
+        flow.math.reduce_mean(
+            flow.math.square(input_tensor, name + "_square_minuend"),
+            axis,
+            keepdims,
+            name + "_reduce_mean_minuend",
+        ),
+        flow.math.square(
+            flow.math.reduce_mean(
+                input_tensor, axis, keepdims, name + "_reduce_mean_subtrahend"
+            ),
+            name + "_square_subtrahend",
+        ),
+        name + "_subtract",
+    )
diff --git a/python/oneflow/compatible/single_client/ops/regularizer_util.py b/python/oneflow/compatible/single_client/ops/regularizer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa94adb5b726a5f163f5ff7223dde81e0671c763
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/regularizer_util.py
@@ -0,0 +1,152 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.core.job import regularizer_conf_pb2 as regularizer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def l1_l2_regularizer(
+    l1: float = 0.01, l2: float = 0.01
+) -> regularizer_conf_util.RegularizerConf:
+    """This operator creates a L1 and L2 weight regularizer. 
+
+    Args:
+        l1 (float, optional): The L1 regularization coefficient. Defaults to 0.01.
+        l2 (float, optional): The L2 regularization coefficient. Defaults to 0.01.
+
+    Returns:
+        regularizer_conf_util.RegularizerConf: A regularizer that can be used in other layers or operators.
+    
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_l1_l2_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            regularizer = flow.regularizers.l1_l2(l1=0.001, l2=0.001)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                kernel_regularizer=regularizer,
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_l1_l2_Job(x)
+    
+    """
+    regularizer = regularizer_conf_util.RegularizerConf()
+    setattr(regularizer.l1_l2_conf, "l1", l1)
+    setattr(regularizer.l1_l2_conf, "l2", l2)
+    return regularizer
+
+
+def l1_regularizer(l: float = 0.01) -> regularizer_conf_util.RegularizerConf:
+    """This operator creates a L1 weight regularizer. 
+
+    Args:
+        l (float, optional): The L1 regularization coefficient. Defaults to 0.01.
+
+    Returns:
+        regularizer_conf_util.RegularizerConf: A regularizer that can be used in other layers or operators.
+    
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_l1_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            regularizer = flow.regularizers.l1(l=0.001)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                kernel_regularizer=regularizer,
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_l1_Job(x)
+            
+    """
+    return l1_l2_regularizer(l1=l, l2=0.0)
+
+
+def l2_regularizer(l: float = 0.01) -> regularizer_conf_util.RegularizerConf:
+    """This operator creates a L2 weight regularizer. 
+
+    Args:
+        l (float, optional): The L2 regularization coefficient. Defaults to 0.01.
+
+    Returns:
+        regularizer_conf_util.RegularizerConf: A regularizer that can be used in other layers or operators.
+
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_l2_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            regularizer = flow.regularizers.l2(l=0.001)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                kernel_regularizer=regularizer,
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_l2_Job(x)
+
+    """
+    return l1_l2_regularizer(l1=0.0, l2=l)
diff --git a/python/oneflow/compatible/single_client/ops/sort_ops.py b/python/oneflow/compatible/single_client/ops/sort_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..643a5358c080581ca5f1e523618d1e9b014b3df2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/sort_ops.py
@@ -0,0 +1,169 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+def _sort_at_last_dim(
+    input: oneflow._oneflow_internal.BlobDesc,
+    direction: str = "ASCENDING",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    assert direction in ["ASCENDING", "DESCENDING"]
+    return (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("Sort_"))
+        .Op("sort")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("direction", direction)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def sort(
+    input: oneflow._oneflow_internal.BlobDesc,
+    axis: int = -1,
+    direction: str = "ASCENDING",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator sorts the input Blob at specified axis.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (int, optional): dimension to be sorted. Defaults to the last dim (-1)
+        direction (str, optional): The direction in which to sort the Blob values. If the direction is "ASCENDING", The order of input will be sorted as ascending, else, the order of input will be sorted as descending. Defaults to "ASCENDING".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The sorted Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def sort_Job(x: tp.Numpy.Placeholder((5, ))
+        ) -> tp.Numpy:
+            return flow.sort(input=x)
+
+        x = np.array([10, 2, 9, 3, 7]).astype("float32")
+        out = sort_Job(x)
+
+        # out [ 2.  3.  7.  9. 10.]
+
+    """
+    assert direction in ["ASCENDING", "DESCENDING"]
+    name = name if name is not None else id_util.UniqueStr("Sort_")
+    num_axes = len(input.shape)
+    axis = axis if axis >= 0 else axis + num_axes
+    assert 0 <= axis < num_axes, "axis out of range"
+    if axis == num_axes - 1:
+        return _sort_at_last_dim(input, direction, name)
+    else:
+        perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+        x = flow.transpose(input, perm, False, True, name + "_transpose")
+        x = _sort_at_last_dim(x, direction, name)
+        return flow.transpose(
+            x, get_inversed_perm(perm), False, True, name + "_inverse_transpose"
+        )
+
+
+def _argsort_at_last_dim(
+    input: oneflow._oneflow_internal.BlobDesc,
+    direction: str = "ASCENDING",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    assert direction in ["ASCENDING", "DESCENDING"]
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("ArgSort_")
+        )
+        .Op("arg_sort")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("direction", direction)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def argsort(
+    input: oneflow._oneflow_internal.BlobDesc,
+    axis: int = -1,
+    direction: str = "ASCENDING",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator sorts the input Blob at specified axis and return the indices of the sorted Blob.
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (int, optional): dimension to be sorted. Defaults to the last dim (-1)
+        direction (str, optional): The direction in which to sort the Blob values. If the direction is "ASCENDING", The order of input will be sorted as ascending, else, the order of input will be sorted as descending. Defaults to "ASCENDING".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The indices of the sorted Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def argsort_Job(x: tp.Numpy.Placeholder((5, ))
+        ) -> tp.Numpy:
+            return flow.argsort(input=x)
+
+        x = np.array([10, 2, 9, 3, 7]).astype("float32")
+        out = argsort_Job(x)
+
+        # out [1 3 4 2 0]
+
+    """
+    assert direction in ["ASCENDING", "DESCENDING"]
+    name = name if name is not None else id_util.UniqueStr("ArgSort_")
+    num_axes = len(input.shape)
+    axis = axis if axis >= 0 else axis + num_axes
+    assert 0 <= axis < num_axes, "axis out of range"
+    if axis == num_axes - 1:
+        return _argsort_at_last_dim(input, direction, name)
+    else:
+        perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+        x = flow.transpose(input, perm, False, True, name + "_transpose")
+        x = _argsort_at_last_dim(x, direction, name)
+        return flow.transpose(
+            x, get_inversed_perm(perm), False, True, name + "_inverse_transpose"
+        )
diff --git a/python/oneflow/compatible/single_client/ops/summary_ops.py b/python/oneflow/compatible/single_client/ops/summary_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..26633f53e40cf7e57f8c5d1f3fc4a9b731d32c4f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/summary_ops.py
@@ -0,0 +1,108 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.ops import user_op_builder as user_op_builder
+
+
+def write_scalar(value, step, tag, name=None):
+    """Write scalar to log file
+
+    Args:
+        value: A 'Blob' with 1 value and dtype in (flow.float, flow.double, flow.int64, flow.int32)
+        step: A 'Blob' with 1 value and dtype is 'flow.int64'
+        tag: A 'Blob' with 1 value and dtype is 'flow.int8'
+        name: This operator's name 
+    """
+    if name is None:
+        name = id_util.UniqueStr("WriteScalar_")
+    flow.user_op_builder(name).Op("summary_write_scalar").Input("in", [value]).Input(
+        "step", [step]
+    ).Input("tag", [tag]).Build().InferAndTryRun()
+
+
+def create_summary_writer(logdir, name=None):
+    """Create a summary writer object
+
+    Args:
+        logdir: log dir
+        name: This operator's name
+    """
+    if name is None:
+        name = id_util.UniqueStr("CreateWriter_")
+    flow.user_op_builder(name).Op("create_summary_writer").Attr(
+        "logdir", logdir
+    ).Build().InferAndTryRun()
+
+
+def flush_summary_writer(name=None):
+    """Flush the summary writer
+
+    Args:
+        name: This operator's name
+    """
+    if name is None:
+        name = id_util.UniqueStr("FlushWriter_")
+    flow.user_op_builder(name).Op("flush_summary_writer").Build().InferAndTryRun()
+
+
+def write_histogram(value, step, tag, name=None):
+    """Write histogram to log file
+
+    Args:
+        value: A 'Blob' with dtype in (flow.float, flow.double, flow.int64, flow.int32, flow.int8, flow.uint8)
+        step: A 'Blob' with 1 value and dtype is 'flow.int64'
+        tag: A 'Blob' with 1 value and dtype is 'flow.int8'
+        name: This operator's name 
+    """
+    if name is None:
+        name = id_util.UniqueStr("WriteHistogram_")
+    flow.user_op_builder(name).Op("summary_write_histogram").Input("in", [value]).Input(
+        "step", [step]
+    ).Input("tag", [tag]).Build().InferAndTryRun()
+
+
+def write_pb(value, step=None, name=None):
+    """Write raw protobuf data to log file
+
+    Args:
+        value: A 'Blob' with dtype in 'flow.int8'
+        step: A 'Blob' with 1 value and dtype is 'flow.int64'
+        name: This operator's name 
+    """
+    if name is None:
+        name = id_util.UniqueStr("WritePb_")
+    flow.user_op_builder(name).Op("summary_write_pb").Input("in", [value]).Input(
+        "step", [step]
+    ).Build().InferAndTryRun()
+
+
+def write_image(value, step=None, tag=None, name=None):
+    """Write image to log file
+
+    Args:
+        value: A 'Blob' with dtype in 'flow.uint8'
+        step: A 'Blob' with 1 value and dtype is 'flow.int64'
+        tag: A 'Blob' with 1 value and dtype is 'flow.int8'
+        name: This operator's name 
+    """
+    if name is None:
+        name = id_util.UniqueStr("WriteImage_")
+    if tag is None:
+        tag = "image"
+    flow.user_op_builder(name).Op("summary_write_image").Input("in", [value]).Input(
+        "step", [step]
+    ).Input("tag", [tag]).Build().InferAndTryRun()
diff --git a/python/oneflow/compatible/single_client/ops/tensor_buffer_ops.py b/python/oneflow/compatible/single_client/ops/tensor_buffer_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c7ee3762b457378de3f68aec63610c5cfeaeeca
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/tensor_buffer_ops.py
@@ -0,0 +1,240 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+import operator
+from typing import List, Optional, Sequence
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+
+
+def tensor_buffer_to_tensor(
+    x: oneflow._oneflow_internal.BlobDesc,
+    dtype: flow.dtype,
+    instance_shape: Sequence[int],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator converts the Blob's type from TensorBuffer to Tensor.
+    Some operator's output data type is `TensorBuffer`, you can use this operator to convert back
+    to `Tensor`.
+
+    Refer to `Concept Explanation <https://docs.oneflow.org/basics_topics/concept_explanation.html#3tensorbuffer-tensorlist>`_
+    for more about TensorBuffer.
+
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+        dtype (flow.dtype): The data dtype.
+        instance_shape (Sequence[int]): The shape of each TensorBuffer instance.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: A `Blob`.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def tensor_buffer_to_tensor_Job(x: tp.Numpy.Placeholder(shape=(4, 16, 64, 64), dtype=flow.float32),
+        ) -> tp.Numpy:
+            x = flow.tensor_to_tensor_buffer(x,
+                                            instance_dims=2)
+            return flow.tensor_buffer_to_tensor(x,
+                                                instance_shape=(64, 64),
+                                                dtype=flow.float)
+
+        x = np.random.randn(4, 16, 64, 64).astype(np.float32)
+        out = tensor_buffer_to_tensor_Job(x)
+
+        # out.shape (4, 16, 64, 64)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("TensorBufferToTensor_")
+    return (
+        flow.user_op_builder(name)
+        .Op("tensor_buffer_to_tensor")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("dtype", dtype)
+        .Attr("instance_shape", instance_shape)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def tensor_to_tensor_buffer(
+    x: oneflow._oneflow_internal.BlobDesc,
+    instance_dims: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator converts the Blob's type from Tensor to TensorBuffer.
+
+    Refer to `Concept Explanation <https://docs.oneflow.org/basics_topics/concept_explanation.html#3tensorbuffer-tensorlist>`_
+    for more about TensorBuffer.
+
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): Input `Blob`.
+        instance_dims (int): The dimensions of dynamic tensor instance.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import numpy as np
+        import oneflow.compatible.single_client.typing as tp
+
+
+        @flow.global_function()
+        def tensor_buffer_to_tensor_Job(x: tp.Numpy.Placeholder(shape=(4, 16, 64, 64), dtype=flow.float32),
+        ) -> tp.Numpy:
+            x = flow.tensor_to_tensor_buffer(x,
+                                            instance_dims=2)
+            return flow.tensor_buffer_to_tensor(x,
+                                                instance_shape=(64, 64),
+                                                dtype=flow.float)
+
+        x = np.random.randn(4, 16, 64, 64).astype(np.float32)
+        out = tensor_buffer_to_tensor_Job(x)
+
+        # out.shape (4, 16, 64, 64)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("TensorToTensorBuffer_")
+    return (
+        flow.user_op_builder(name)
+        .Op("tensor_to_tensor_buffer")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("instance_dims", instance_dims)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def gen_tensor_buffer(
+    shape: Sequence[int],
+    shape_list: Sequence[Sequence[int]],
+    value_list: Sequence[float],
+    data_type: Optional[flow.dtype] = flow.float32,
+    dynamic_out: Optional[bool] = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator generates a tensor buffer blob.
+
+    Args:
+        shape (Sequence[int]): shape of output blob
+        shape_list ( Sequence[Sequence[int]]): shapes for tensor buffer in output blob
+        value_list (Sequence[float]): values for tensor buffer in output blob
+        data_type (Optional[flow.dtype]): data type for tensor buffer in output blob
+        dynamic_out (Optional[bool]): if output is a dynamic blob
+        name (Optional[str]): The name for the operation. Defaults to None.
+
+    Returns:
+        BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+
+        @flow.global_function(function_config=func_config)
+        def GenTensorBufferJob():
+            with flow.scope.placement("cpu", "0:0"):
+                x = flow.gen_tensor_buffer([(2,)], [(2, 1), (1, 2)], [0.0, 1.0])
+                y = flow.tensor_buffer_to_list_of_tensors(x, (100, 100), flow.float, True)
+                return y
+
+        # y_0.shape (2, 1), y_1.shape (1. 2)
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("GenTensorBuffer_")
+        )
+        .Op("gen_tensor_buffer")
+        .Output("out")
+        .Attr("shape", shape)
+        .Attr("shape_list", shape_list)
+        .Attr("value_list", value_list)
+        .Attr("data_type", data_type)
+        .Attr("dynamic_out", dynamic_out)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def tensor_buffer_to_list_of_tensors(
+    x: oneflow._oneflow_internal.BlobDesc,
+    out_shape: Sequence[int],
+    out_dtype: flow.dtype,
+    dynamic_out: Optional[bool] = False,
+    name: Optional[str] = None,
+) -> List[oneflow._oneflow_internal.BlobDesc]:
+    """This operator converts the Blob of TensorBuffer to list of Tensors. Every element in x will be converted
+    to a Tensor and output will be flatten to a list.
+
+    Args:
+        x (BlobDesc): Input `Blob`, data type must be tensor buffer.
+        out_shape (Sequence[int]): max shape for a tensor buffer in x
+        out_dtype (flow.dtype,): output data type
+        dynamic_out (Optional[bool]): if output is dynamic blob. Default to False.
+        name (Optional[str]): The name for the operation. Default to None.
+
+    Returns:
+        List[BlobDesc]: result blobs
+
+    For example:
+
+    .. code-block:: python
+
+        # the same with `gen_tensor_buffer` op
+
+    """
+    return (
+        flow.user_op_builder(
+            name
+            if name is not None
+            else id_util.UniqueStr("TensorBufferToListOfTensors_")
+        )
+        .Op("tensor_buffer_to_list_of_tensors")
+        .Input("in", [x])
+        .Output("out", functools.reduce(operator.mul, x.shape, 1))
+        .Attr("out_dtype", out_dtype)
+        .Attr("out_shape", out_shape)
+        .Attr("dynamic_out", dynamic_out)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
diff --git a/python/oneflow/compatible/single_client/ops/transpose_util.py b/python/oneflow/compatible/single_client/ops/transpose_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c93cd1a221796b0eb6dc4b43d4f0735babd3c366
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/transpose_util.py
@@ -0,0 +1,36 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence
+
+
+def is_perm(perm: Sequence[int]) -> bool:
+    return list(range(len(perm))) == sorted(list(perm))
+
+
+def get_perm_when_transpose_axis_to_last_dim(num_axes: int, axis: int) -> tuple:
+    axis = axis if axis >= 0 else axis + num_axes
+    assert 0 <= axis < num_axes, "axis out of range"
+    perm = [dim if dim < axis else dim + 1 for dim in range(num_axes - 1)]
+    perm.append(axis)
+    return tuple(perm)
+
+
+def get_inversed_perm(perm: Sequence[int]) -> tuple:
+    assert is_perm(perm)
+    inversed_perm = [-1] * len(perm)
+    for i in range(len(perm)):
+        inversed_perm[perm[i]] = i
+    return tuple(inversed_perm)
diff --git a/python/oneflow/compatible/single_client/ops/two_stage_reduce.py b/python/oneflow/compatible/single_client/ops/two_stage_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..024c1f236d68f989b18ddabbc4ed0dbe3b8b7093
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/two_stage_reduce.py
@@ -0,0 +1,156 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import distribute as distribute_util
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.ops import user_op_builder as user_op_builder
+from oneflow.compatible.single_client.support import enable_if as enable_if
+
+
+def api_two_stage_reduce_max(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([two_stage_reduce_max])
+    return func(x, axis=axis, keepdims=keepdims, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def two_stage_reduce_max(x, axis=None, keepdims=False, name=None):
+    name = name if name is not None else id_util.UniqueStr("ReduceMax_")
+    return two_stage_reduce(x, axis, keepdims, "reduce_max", name)
+
+
+def api_two_stage_reduce_min(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([two_stage_reduce_min])
+    return func(x, axis=axis, keepdims=keepdims, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def two_stage_reduce_min(x, axis=None, keepdims=False, name=None):
+    name = name if name is not None else id_util.UniqueStr("ReduceMin_")
+    return two_stage_reduce(x, axis, keepdims, "reduce_min", name)
+
+
+def two_stage_reduce(x, axis=None, keepdims=False, op_type_name=None, name=None):
+    assert check_x_dictribute(x, axis)
+    axis = _check_axis(axis, x.shape)
+    device_stage_out_list = []
+    device_stage_count_list = []
+    distribute_axis = x.distribute.axis
+    x_list = flow.advanced.distribute_split(x, axis=distribute_axis)
+    parallel_desc_symbol = flow.current_scope().device_parallel_desc_symbol
+    device_tag = parallel_desc_symbol.device_tag
+    parallel_id = 0
+    for (
+        machine_id,
+        device_ids,
+    ) in parallel_desc_symbol.machine_id2device_id_list.items():
+        for device_id in device_ids:
+            with flow.scope.placement(
+                device_tag, "@" + str(machine_id) + ":" + str(device_id)
+            ):
+                (device_stage_out, device_stage_count) = reduce_device_stage(
+                    x_list[parallel_id],
+                    axis,
+                    op_type_name + "_device_stage",
+                    name + "_device_stage" + str(parallel_id),
+                )
+                device_stage_out_list.append(device_stage_out)
+                device_stage_count_list.append(device_stage_count)
+                parallel_id += 1
+    device_stage_out = flow.advanced.distribute_concat(
+        device_stage_out_list, axis=distribute_axis
+    )
+    device_stage_count = flow.advanced.distribute_concat(
+        device_stage_count_list, axis=distribute_axis
+    )
+    device_stage_out = device_stage_out.with_distribute(flow.distribute.broadcast())
+    device_stage_count = device_stage_count.with_distribute(flow.distribute.broadcast())
+    out = reduce_global_stage(
+        device_stage_out,
+        device_stage_count,
+        axis,
+        keepdims,
+        op_type_name + "_global_stage",
+        name + "_global_stage",
+    )
+    return out
+
+
+def reduce_device_stage(x, axis, op_name, name):
+    (out, mask, count) = (
+        flow.user_op_builder(name)
+        .Op(op_name)
+        .Input("in", [x])
+        .Output("out")
+        .Output("mask")
+        .Output("count")
+        .Attr("axis", axis)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (out, count)
+
+
+def reduce_global_stage(x, device_count, axis, keepdims, op_name, name):
+    (out, mask) = (
+        flow.user_op_builder(name)
+        .Op(op_name)
+        .Input("in", [x])
+        .Input("device_count", [device_count])
+        .Output("out")
+        .Output("mask")
+        .Attr("axis", axis)
+        .Attr("keepdims", keepdims)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return out
+
+
+def _check_axis(axis, shape):
+    if axis is None:
+        axis = list(range(len(shape)))
+    if isinstance(axis, int):
+        axis = [axis]
+    assert isinstance(axis, (list, tuple)), "Invalid axis {}".format(axis)
+    for x in axis:
+        if x < 0:
+            x += len(shape)
+        assert x >= 0 and x < len(shape), "Invalid axis {}".format(axis)
+    return axis
+
+
+def check_x_dictribute(x, axis):
+    for i in axis:
+        if x.distribute is oneflow._oneflow_internal.distribute.split(i):
+            return True
+    return False
diff --git a/python/oneflow/compatible/single_client/ops/user_data_ops.py b/python/oneflow/compatible/single_client/ops/user_data_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..01007c40a28967489bc0f51fa96960b2dfe57bce
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/user_data_ops.py
@@ -0,0 +1,2383 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+import traceback
+from typing import Optional, Sequence, Union
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import module as module_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+
+
+def OFRecordRawDecoder(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    blob_name: str,
+    shape: Sequence[int],
+    dtype: flow.dtype,
+    dim1_varying_length: bool = False,
+    truncate: bool = False,
+    auto_zero_padding: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if auto_zero_padding:
+        print(
+            "WARNING: auto_zero_padding has been deprecated, Please use truncate instead.\n            "
+        )
+    if name is None:
+        name = id_util.UniqueStr("OFRecordRawDecoder_")
+    return (
+        flow.user_op_builder(name)
+        .Op("ofrecord_raw_decoder")
+        .Input("in", [input_blob])
+        .Output("out")
+        .Attr("name", blob_name)
+        .Attr("shape", shape)
+        .Attr("data_type", dtype)
+        .Attr("dim1_varying_length", dim1_varying_length)
+        .Attr("truncate", truncate or auto_zero_padding)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def OFRecordBytesDecoder(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    blob_name: str,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = id_util.UniqueStr("OFRecordBytesDecoder_")
+    return (
+        flow.user_op_builder(name)
+        .Op("ofrecord_bytes_decoder")
+        .Input("in", [input_blob])
+        .Output("out")
+        .Attr("name", blob_name)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_ofrecord_image_decoder_random_crop(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    blob_name: str,
+    color_space: str = "BGR",
+    num_attempts: int = 10,
+    seed: Optional[int] = None,
+    random_area: Sequence[float] = [0.08, 1.0],
+    random_aspect_ratio: Sequence[float] = [0.75, 1.333333],
+    name: str = "OFRecordImageDecoderRandomCrop",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator is an image decoder with random crop.
+
+    Args:
+        input_blob (oneflow._oneflow_internal.BlobDesc): The input Blob
+        blob_name (str): The name of the Blob
+        color_space (str, optional): The color space, such as "RGB", "BGR". Defaults to "BGR".
+        num_attempts (int, optional): The maximum number of random cropping attempts. Defaults to 10.
+        seed (Optional[int], optional): The random seed. Defaults to None.
+        random_area (Sequence[float], optional): The random cropping area. Defaults to [0.08, 1.0].
+        random_aspect_ratio (Sequence[float], optional): The random scaled ratio. Defaults to [0.75, 1.333333].
+        name (str, optional): The name for the operation. Defaults to "OFRecordImageDecoderRandomCrop".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The random cropped Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function(type="predict")
+        def ofrecord_reader_job() -> Tuple[tp.Numpy, tp.Numpy]:
+            batch_size = 16
+            color_space = "RGB"
+            # our ofrecord file path is "./dataset/part-0"
+            ofrecord = flow.data.ofrecord_reader(
+                "./imgdataset",
+                batch_size=batch_size,
+                data_part_num=1,
+                part_name_suffix_length=-1,
+                part_name_prefix='part-',
+                random_shuffle=True,
+                shuffle_after_epoch=True,
+            )
+            image = flow.data.OFRecordImageDecoderRandomCrop(
+                    ofrecord, "encoded", color_space=color_space
+                )
+            res_image, scale, new_size = flow.image.Resize(
+                    image, target_size=(224, 224)
+                )
+            label = flow.data.OFRecordRawDecoder(
+                ofrecord, "class/label", shape=(1, ), dtype=flow.int32
+            )
+
+            return res_image, label
+
+        if __name__ == "__main__":
+            images, labels = ofrecord_reader_job()
+            # images.shape (16, 224, 224, 3)
+
+    """
+    assert isinstance(name, str)
+    if seed is not None:
+        assert name is not None
+    module = flow.find_or_create_module(
+        name,
+        lambda: OFRecordImageDecoderRandomCropModule(
+            blob_name=blob_name,
+            color_space=color_space,
+            num_attempts=num_attempts,
+            random_seed=seed,
+            random_area=random_area,
+            random_aspect_ratio=random_aspect_ratio,
+            name=name,
+        ),
+    )
+    return module(input_blob)
+
+
+class OFRecordImageDecoderRandomCropModule(module_util.Module):
+    def __init__(
+        self,
+        blob_name: str,
+        color_space: str,
+        num_attempts: int,
+        random_seed: Optional[int],
+        random_area: Sequence[float],
+        random_aspect_ratio: Sequence[float],
+        name: str,
+    ):
+        module_util.Module.__init__(self, name)
+        (seed, has_seed) = flow.random.gen_seed(random_seed)
+        self.op_module_builder = (
+            flow.user_op_module_builder("ofrecord_image_decoder_random_crop")
+            .InputSize("in", 1)
+            .Output("out")
+            .Attr("name", blob_name)
+            .Attr("color_space", color_space)
+            .Attr("num_attempts", num_attempts)
+            .Attr("random_area", random_area)
+            .Attr("random_aspect_ratio", random_aspect_ratio)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .CheckAndComplete()
+        )
+        self.op_module_builder.user_op_module.InitOpKernel()
+
+    def forward(self, input: oneflow._oneflow_internal.BlobDesc):
+        if self.call_seq_no == 0:
+            name = self.module_name
+        else:
+            name = id_util.UniqueStr("OFRecordImageDecoderRandomCrop_")
+        return (
+            self.op_module_builder.OpName(name)
+            .Input("in", [input])
+            .Build()
+            .InferAndTryRun()
+            .SoleOutputBlob()
+        )
+
+
+def OFRecordImageDecoder(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    blob_name: str,
+    color_space: str = "BGR",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator is an image decoder.
+
+    Args:
+        input_blob (oneflow._oneflow_internal.BlobDesc): The input Blob
+        blob_name (str): The name of the input Blob
+        color_space (str, optional): The color space, such as "RGB", "BGR". Defaults to "BGR".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function(type="predict")
+        def image_decoder_job() -> Tuple[tp.Numpy, tp.Numpy]:
+            batch_size = 16
+            color_space = "RGB"
+            # our ofrecord file path is "./dataset/part-0"
+            ofrecord = flow.data.ofrecord_reader(
+                "./imgdataset",
+                batch_size=batch_size,
+                data_part_num=1,
+                part_name_suffix_length=-1,
+                part_name_prefix='part-',
+                random_shuffle=True,
+                shuffle_after_epoch=True,
+            )
+            image = flow.data.OFRecordImageDecoder(
+                    ofrecord, "encoded", color_space=color_space
+                )
+            res_image, scale, new_size = flow.image.Resize(
+                    image, target_size=(224, 224)
+                )
+            label = flow.data.OFRecordRawDecoder(
+                ofrecord, "class/label", shape=(1, ), dtype=flow.int32
+            )
+
+            return res_image, label
+
+        if __name__ == "__main__":
+            images, labels = image_decoder_job()
+            # image.shape (16, 224, 224, 3)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("OFRecordImageDecoder_")
+    return (
+        flow.user_op_builder(name)
+        .Op("ofrecord_image_decoder")
+        .Input("in", [input_blob])
+        .Output("out")
+        .Attr("name", blob_name)
+        .Attr("color_space", color_space)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_image_resize(
+    image: oneflow._oneflow_internal.BlobDesc,
+    target_size: Union[int, Sequence[int]] = None,
+    min_size: Optional[int] = None,
+    max_size: Optional[int] = None,
+    keep_aspect_ratio: bool = False,
+    resize_side: str = "shorter",
+    channels: int = 3,
+    dtype: Optional[flow.dtype] = None,
+    interpolation_type: str = "auto",
+    name: Optional[str] = None,
+    color_space: Optional[str] = None,
+    interp_type: Optional[str] = None,
+    resize_shorter: int = 0,
+    resize_x: int = 0,
+    resize_y: int = 0,
+) -> Union[
+    oneflow._oneflow_internal.BlobDesc, Sequence[oneflow._oneflow_internal.BlobDesc]
+]:
+    """Resize images to target size.
+
+    Args:
+        image: A `Tensor` consists of images to be resized.
+        target_size: A list or tuple when `keep_aspect_ratio` is false or an int when `keep_aspect_ratio` is true. When `keep_aspect_ratio` is false, `target_size` has a form of `(target_width, target_height)` that image will resize to. When `keep_aspect_ratio` is true, the longer side or shorter side of the image will be resized to target size.
+        min_size: An int, optional. Only works when `keep_aspect_ratio` is true and `resize_side` is "longer". If `min_size` is not None, the shorter side must be greater than or equal to `min_size`. Default is None.
+        max_size: An int, optional. Only works when `keep_aspect_ratio` is true and `resize_side` is "shorter". If `max_size` is not None, the longer side must be less than or equal to `max_size`. Default is None.
+        keep_aspect_ratio: A bool. If is false, indicate that image will be resized to fixed width and height, otherwise image will be resized keeping aspect ratio.
+        resize_side: A str of "longer" or "shorter". Only works when `keep_aspect_ratio` is True. If `resize_side` is "longer", the longer side of image will be resized to `target_size`. If `resize_side` is "shorter", the shorter side of image will be resized to `target_size`.
+        channels: An int. how many channels an image has
+        dtype: `oneflow.compatible.single_client.dtype`. Indicate output resized image data type.
+        interpolation_type: A str of "auto", "bilinear", "nearest_neighbor", "bicubic" or "area". Indicate interpolation method used to resize image.
+        name: A str, optional. Name for the operation.
+        color_space: Deprecated, a str of "RGB", "BGR" or "GRAY". Please use `channels` instead.
+        interp_type: Deprecated, s str of "Linear", "Cubic" or "NN". Please use `interpolation_type` instead.
+        resize_shorter: Deprecated, a int. Indicate target size that the shorter side of image will resize to. Please use `target_size` and `resize_side` instead.
+        resize_x: Deprecated, a int. Indicate the target size that the width of image will resize to. Please use `target_size` instead.
+        resize_y: Deprecated, a int. Indicate the target size that the height of image will resize to. Please use `target_size` instead.
+
+    Returns:
+        Tuple of resized images `Blob`, width and height scales `Blob` and new width and height `Blob`
+        (new width and height `Blob` will be None when keep_aspect_ratio is false).
+        If deprecated params are used, a single resized images `Blob` will be returned.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function(type="predict")
+        def ofrecord_reader_job() -> Tuple[tp.Numpy, tp.Numpy]:
+            batch_size = 16
+            color_space = "RGB"
+            # our ofrecord file path is "./dataset/part-0"
+            ofrecord = flow.data.ofrecord_reader(
+                "./imgdataset",
+                batch_size=batch_size,
+                data_part_num=1,
+                part_name_suffix_length=-1,
+                part_name_prefix='part-',
+                random_shuffle=True,
+                shuffle_after_epoch=True,
+            )
+            image = flow.data.OFRecordImageDecoderRandomCrop(
+                    ofrecord, "encoded", color_space=color_space
+                )
+            res_image, scale, new_size = flow.image.Resize(
+                    image, target_size=(224, 224)
+                )
+            label = flow.data.OFRecordRawDecoder(
+                ofrecord, "class/label", shape=(1, ), dtype=flow.int32
+            )
+
+            return res_image, label
+
+        if __name__ == "__main__":
+            images, labels = ofrecord_reader_job()
+            # image.shape (16, 224, 224, 3)
+
+    """
+    deprecated_param_used = False
+    if color_space is not None:
+        print("WARNING: color_space has been deprecated. Please use channels instead.")
+        print(traceback.format_stack()[-2])
+        deprecated_param_used = True
+        assert isinstance(color_space, str)
+        if color_space.upper() == "RGB" or color_space.upper() == "BGR":
+            channels = 3
+        elif color_space.upper() == "GRAY":
+            channels = 1
+        else:
+            raise ValueError("invalid color_space")
+    if interp_type is not None:
+        print(
+            "WARNING: interp_type has been deprecated. Please use interpolation_type instead."
+        )
+        print(traceback.format_stack()[-2])
+        deprecated_param_used = True
+        assert isinstance(interp_type, str)
+        if interp_type == "Linear":
+            interpolation_type = "bilinear"
+        elif interp_type == "NN":
+            interpolation_type = "nearest_neighbor"
+        elif interp_type == "Cubic":
+            interpolation_type = "bicubic"
+        else:
+            raise ValueError("invalid interp_type")
+    if resize_x > 0 and resize_y > 0:
+        print(
+            "WARNING: resize_x and resize_y has been deprecated. Please use target_size instead."
+        )
+        print(traceback.format_stack()[-2])
+        deprecated_param_used = True
+        target_size = (resize_x, resize_y)
+        keep_aspect_ratio = False
+    if resize_shorter > 0:
+        print(
+            "WARNING: resize_shorter has been deprecated. Please use target_size instead."
+        )
+        print(traceback.format_stack()[-2])
+        deprecated_param_used = True
+        target_size = resize_shorter
+        keep_aspect_ratio = True
+        resize_side = "shorter"
+    if name is None:
+        name = id_util.UniqueStr("ImageResize_")
+    if keep_aspect_ratio:
+        if not isinstance(target_size, int):
+            raise ValueError(
+                "target_size must be an int when keep_aspect_ratio is True"
+            )
+        if min_size is None:
+            min_size = 0
+        if max_size is None:
+            max_size = 0
+        if resize_side == "shorter":
+            resize_longer = False
+        elif resize_side == "longer":
+            resize_longer = True
+        else:
+            raise ValueError('resize_side must be "shorter" or "longer"')
+        op = (
+            flow.user_op_builder(name)
+            .Op("image_resize_keep_aspect_ratio")
+            .Input("in", [image])
+            .Output("out")
+            .Output("size")
+            .Output("scale")
+            .Attr("target_size", target_size)
+            .Attr("min_size", min_size)
+            .Attr("max_size", max_size)
+            .Attr("resize_longer", resize_longer)
+            .Attr("interpolation_type", interpolation_type)
+            .Build()
+        )
+        (res_image, new_size, scale) = op.InferAndTryRun().RemoteBlobList()
+        scale = flow.tensor_buffer_to_tensor(
+            scale, dtype=flow.float32, instance_shape=(2,)
+        )
+        new_size = flow.tensor_buffer_to_tensor(
+            new_size, dtype=flow.int32, instance_shape=(2,)
+        )
+    else:
+        if (
+            not isinstance(target_size, (list, tuple))
+            or len(target_size) != 2
+            or (not all((isinstance(size, int) for size in target_size)))
+        ):
+            raise ValueError(
+                "target_size must be a form like (width, height) when keep_aspect_ratio is False"
+            )
+        if dtype is None:
+            dtype = flow.uint8
+        (target_w, target_h) = target_size
+        op = (
+            flow.user_op_builder(name)
+            .Op("image_resize_to_fixed")
+            .Input("in", [image])
+            .Output("out")
+            .Output("scale")
+            .Attr("target_width", target_w)
+            .Attr("target_height", target_h)
+            .Attr("channels", channels)
+            .Attr("data_type", dtype)
+            .Attr("interpolation_type", interpolation_type)
+            .Build()
+        )
+        (res_image, scale) = op.InferAndTryRun().RemoteBlobList()
+        new_size = None
+    if deprecated_param_used:
+        return res_image
+    return (res_image, scale, new_size)
+
+
+def api_image_target_resize(
+    images: oneflow._oneflow_internal.BlobDesc,
+    target_size: int,
+    min_size: Optional[int] = None,
+    max_size: Optional[int] = None,
+    resize_side: str = "shorter",
+    interpolation_type: str = "auto",
+    name: Optional[str] = None,
+) -> Sequence[oneflow._oneflow_internal.BlobDesc]:
+    """This operator resizes image to target size.
+
+    Args:
+        images (oneflow._oneflow_internal.BlobDesc): The input Blob. Its type should be `kTensorBuffer`. More details please refer to the code example.
+        target_size (int): An int, the target size.
+        min_size (Optional[int], optional): If `min_size` is not None, the shorter side must be greater than or equal to `min_size`. Default is None. Defaults to None.
+        max_size (Optional[int], optional): If `max_size` is not None, the longer side must be less than or equal to `max_size`. Defaults to None.
+        resize_side (str, optional): A str of "longer" or "shorter". Only works when `keep_aspect_ratio` is True. If `resize_side` is "longer", the longer side of image will be resized to `target_size`. If `resize_side` is "shorter", the shorter side of image will be resized to `target_size`. Defaults to "shorter".
+        interpolation_type (str, optional): A str of "auto", "bilinear", "nearest_neighbor", "bicubic" or "area". Indicate interpolation method used to resize image. Defaults to "auto".
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        Sequence[oneflow._oneflow_internal.BlobDesc]: A Sequence includes the result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+        import numpy as np
+        import cv2
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return [np.expand_dims(image, axis=0) for image in images]
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            assert image_static_shape[0] == 1, str(image_static_shape)
+            image_static_shape[0] = len(image_shapes)
+            return image_static_shape
+
+        def _of_image_target_resize(images, image_static_shape, target_size, max_size):
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def image_target_resize_job(images_def: tp.ListListNumpy.Placeholder(shape=image_static_shape, dtype=flow.float)
+            ) -> Tuple[tp.ListListNumpy, tp.ListNumpy, tp.ListNumpy]:
+                # The input Blob type should be "kTensorBuffer"
+                # So we use oneflow.compatible.single_client.tensor_list_to_tensor_buffer to convert
+                images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+
+                resized_images_buffer, size, scale = flow.image_target_resize(
+                    images_buffer,
+                    target_size=target_size,
+                    max_size=max_size,
+                    resize_side="shorter",
+                )
+                # We convert back to "tensorlist" type
+                resized_images = flow.tensor_buffer_to_tensor_list(
+                    resized_images_buffer,
+                    shape=(target_size, max_size, image_static_shape[-1]),
+                    dtype=flow.float,
+                )
+                return resized_images, size, scale
+
+            resized_images, size, scale = image_target_resize_job([images])
+            resized_image = resized_images[0]
+            size = size[0]
+            scale = scale[0]
+
+            return resized_images, size, scale
+
+        if __name__ == "__main__":
+            img = _read_images_by_cv(['./img/1.jpg'])
+            img_shape = _get_images_static_shape(img) # In example is [1, 349, 367, 3]
+            target_size = 256
+            max_size = 512
+            resized_images, size, scale = _of_image_target_resize(img, tuple(img_shape), target_size, max_size)
+            # Here the shorter side is "349", we resize it to target_size(256)
+            # The scale is 256 / 349 = 0.73
+            # The longer side will be resized to 367 * scale = 269
+            # get the first element from the resized_images (its type is `list.list`)
+            print(resized_images[0][0].shape) # (1, 256, 269, 3)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ImageTargetResize_")
+    (res_image, scale, new_size) = api_image_resize(
+        images,
+        target_size=target_size,
+        min_size=min_size,
+        max_size=max_size,
+        keep_aspect_ratio=True,
+        resize_side=resize_side,
+        interpolation_type=interpolation_type,
+        name=name,
+    )
+    return (res_image, new_size, scale)
+
+
+def CropMirrorNormalize(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    mirror_blob: Optional[oneflow._oneflow_internal.BlobDesc] = None,
+    color_space: str = "BGR",
+    output_layout: str = "NCHW",
+    crop_h: int = 0,
+    crop_w: int = 0,
+    crop_pos_y: float = 0.5,
+    crop_pos_x: float = 0.5,
+    mean: Sequence[float] = [0.0],
+    std: Sequence[float] = [1.0],
+    output_dtype: flow.dtype = flow.float,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator performs the cropping, normalization, and horizontal flip for input Blob.
+
+    If `crop_h` and `crop_w` are provided, the image cropping position is specified by "crop_pos_y" and "crop_pos_x".
+
+    The position is computed as follows:
+
+    .. math::
+
+        & crop_x = crop\\_pos\\_x*(Width-crop\\_w)
+
+        & crop_y = crop\\_pos\\_y*(Height-crop\\_h)
+
+    The `Width` and `Height` is the width and height of input Blob.
+
+    Args:
+        input_blob (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        mirror_blob (Optional[oneflow._oneflow_internal.BlobDesc], optional): The operation for horizontal flip, if it is `None`, the operator will not perform the horizontal flip. Defaults to None.
+        color_space (str, optional): The color space for input Blob. Defaults to "BGR".
+        output_layout (str, optional): The output format. Defaults to "NCHW".
+        crop_h (int, optional): The image cropping window height. Defaults to 0.
+        crop_w (int, optional): The image cropping window width. Defaults to 0.
+        crop_pos_y (float, optional): The vertical position of the image cropping window, the value range is normalized to (0.0, 1.0). Defaults to 0.5.
+        crop_pos_x (float, optional): The horizontal position of the image cropping window, the value range is normalized to (0.0, 1.0). Defaults to 0.5.
+        mean (Sequence[float], optional): The mean value for normalization. Defaults to [0.0].
+        std (Sequence[float], optional): The standard deviation values for normalization. Defaults to [1.0].
+        output_dtype (flow.dtype, optional): The datatype of output Blob. Defaults to flow.float.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        NotImplementedError: The data type of input Blob should be `tensor_buffer` or `uint8`
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function(type="predict")
+        def crop_mirror_job() -> Tuple[tp.Numpy, tp.Numpy]:
+            batch_size = 1
+            color_space = "RGB"
+            # our ofrecord file path is "./dataset/part-0"
+            ofrecord = flow.data.ofrecord_reader(
+                "./imgdataset",
+                batch_size=batch_size,
+                data_part_num=1,
+                part_name_suffix_length=-1,
+                part_name_prefix='part-',
+                shuffle_after_epoch=True,
+            )
+            image = flow.data.OFRecordImageDecoder(
+                    ofrecord, "encoded", color_space=color_space
+                )
+            res_image, scale, new_size = flow.image.Resize(
+                    image, target_size=(512, 512)
+                )
+            label = flow.data.OFRecordRawDecoder(
+                ofrecord, "class/label", shape=(1, ), dtype=flow.int32
+            )
+            rng = flow.random.CoinFlip(batch_size=batch_size)
+            normal = flow.image.CropMirrorNormalize(
+                    res_image,
+                    mirror_blob=rng,
+                    color_space=color_space,
+                    crop_h= 256,
+                    crop_w= 256,
+                    crop_pos_y=0.5,
+                    crop_pos_x=0.5,
+                    mean=[123.68, 116.779, 103.939],
+                    std=[58.393, 57.12, 57.375],
+                    output_dtype=flow.float,
+                )
+
+            return normal, label
+
+        if __name__ == "__main__":
+            images, labels = crop_mirror_job()
+            # images.shape (1, 3, 256, 256)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("CropMirrorNormalize_")
+    op_type_name = ""
+    if input_blob.dtype is flow.tensor_buffer:
+        op_type_name = "crop_mirror_normalize_from_tensorbuffer"
+    elif input_blob.dtype is flow.uint8:
+        op_type_name = "crop_mirror_normalize_from_uint8"
+    else:
+        print(
+            "ERROR! oneflow.compatible.single_client.data.crop_mirror_normalize op",
+            " NOT support input data type : ",
+            input_blob.dtype,
+        )
+        raise NotImplementedError
+    op = flow.user_op_builder(name).Op(op_type_name).Input("in", [input_blob])
+    if mirror_blob is not None:
+        op = op.Input("mirror", [mirror_blob])
+    return (
+        op.Output("out")
+        .Attr("color_space", color_space)
+        .Attr("output_layout", output_layout)
+        .Attr("mean", mean)
+        .Attr("std", std)
+        .Attr("crop_h", crop_h)
+        .Attr("crop_w", crop_w)
+        .Attr("crop_pos_y", crop_pos_y)
+        .Attr("crop_pos_x", crop_pos_x)
+        .Attr("output_dtype", output_dtype)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_image_random_crop(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    num_attempts: int = 10,
+    seed: Optional[int] = None,
+    random_area: Sequence[float] = None,
+    random_aspect_ratio: Sequence[float] = None,
+    name: str = "ImageRandomCrop",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator crops the input image randomly.
+
+    Args:
+        input_blob (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        num_attempts (int, optional): The maximum number of random cropping attempts. Defaults to 10.
+        seed (Optional[int], optional): The random seed. Defaults to None.
+        random_area (Sequence[float], optional): The random cropping area. Defaults to None.
+        random_aspect_ratio (Sequence[float], optional): The random scaled ratio. Defaults to None.
+        name (str, optional): The name for the operation. Defaults to "ImageRandomCrop".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+        import cv2
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return [np.expand_dims(image, axis=0) for image in images]
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            assert image_static_shape[0] == 1, str(image_static_shape)
+            image_static_shape[0] = len(image_shapes)
+            return image_static_shape
+
+        def _of_image_random_crop(images, image_static_shape):
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def image_random_crop_job(images_def: tp.ListListNumpy.Placeholder(shape=image_static_shape, dtype=flow.float)
+            ) -> tp.ListListNumpy:
+                # The input Blob type should be "kTensorBuffer"
+                # So we use oneflow.compatible.single_client.tensor_list_to_tensor_buffer to convert
+                images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+                # Do the random crop
+                random_crop_buffer = flow.image.random_crop(
+                    images_buffer,
+                    random_area=[0.15, 0.80],
+                    random_aspect_ratio=[0.75, 1.55],
+                )
+                # We convert back to "tensorlist" type
+                random_crop_images = flow.tensor_buffer_to_tensor_list(
+                    random_crop_buffer,
+                    shape=(image_static_shape[1], image_static_shape[2], image_static_shape[-1]),
+                    dtype=flow.float,
+                )
+                return random_crop_images
+
+            random_crop_images = image_random_crop_job([images])
+
+            return random_crop_images
+
+        if __name__ == "__main__":
+            img = _read_images_by_cv(['./img/1.jpg'])
+            img_shape = _get_images_static_shape(img) # In example is (1, 234, 346, 3)
+            random_crop_images = _of_image_random_crop(img, tuple(img_shape))
+            # random_crop_images.shape is (234, 346, 3)
+
+    """
+    assert isinstance(name, str)
+    if seed is not None:
+        assert name is not None
+    if random_area is None:
+        random_area = [0.08, 1.0]
+    if random_aspect_ratio is None:
+        random_aspect_ratio = [0.75, 1.333333]
+    module = flow.find_or_create_module(
+        name,
+        lambda: ImageRandomCropModule(
+            num_attempts=num_attempts,
+            random_seed=seed,
+            random_area=random_area,
+            random_aspect_ratio=random_aspect_ratio,
+            name=name,
+        ),
+    )
+    return module(input_blob)
+
+
+class ImageRandomCropModule(module_util.Module):
+    def __init__(
+        self,
+        num_attempts: int,
+        random_seed: Optional[int],
+        random_area: Sequence[float],
+        random_aspect_ratio: Sequence[float],
+        name: str,
+    ):
+        module_util.Module.__init__(self, name)
+        (seed, has_seed) = flow.random.gen_seed(random_seed)
+        self.op_module_builder = (
+            flow.user_op_module_builder("image_random_crop")
+            .InputSize("in", 1)
+            .Output("out")
+            .Attr("num_attempts", num_attempts)
+            .Attr("random_area", random_area)
+            .Attr("random_aspect_ratio", random_aspect_ratio)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .CheckAndComplete()
+        )
+        self.op_module_builder.user_op_module.InitOpKernel()
+
+    def forward(self, input: oneflow._oneflow_internal.BlobDesc):
+        if self.call_seq_no == 0:
+            name = self.module_name
+        else:
+            name = id_util.UniqueStr("ImageRandomCrop_")
+        return (
+            self.op_module_builder.OpName(name)
+            .Input("in", [input])
+            .Build()
+            .InferAndTryRun()
+            .SoleOutputBlob()
+        )
+
+
+def api_coin_flip(
+    batch_size: int = 1,
+    seed: Optional[int] = None,
+    probability: float = 0.5,
+    name: str = "CoinFlip",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator performs the horizontal flip.
+
+    Args:
+        batch_size (int, optional): The batch size. Defaults to 1.
+        seed (Optional[int], optional): The random seed. Defaults to None.
+        probability (float, optional): The flip probability. Defaults to 0.5.
+        name (str, optional): The name for the operation. Defaults to "CoinFlip".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: [description]
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function(type="predict")
+        def coin_flip_job() -> Tuple[tp.Numpy, tp.Numpy]:
+            batch_size = 1
+            color_space = "RGB"
+            # our ofrecord file path is "./dataset/part-0"
+            ofrecord = flow.data.ofrecord_reader(
+                "./imgdataset",
+                batch_size=batch_size,
+                data_part_num=1,
+                part_name_suffix_length=-1,
+                part_name_prefix='part-',
+                shuffle_after_epoch=True,
+            )
+            image = flow.data.OFRecordImageDecoder(
+                    ofrecord, "encoded", color_space=color_space
+                )
+            res_image, scale, new_size = flow.image.Resize(
+                    image, target_size=(512, 512)
+                )
+            label = flow.data.OFRecordRawDecoder(
+                ofrecord, "class/label", shape=(1, ), dtype=flow.int32
+            )
+            coin_flip = flow.random.CoinFlip(
+                batch_size=batch_size,
+                probability=0.8
+            )
+            normal = flow.image.CropMirrorNormalize(
+                    res_image,
+                    mirror_blob=coin_flip,
+                    color_space=color_space,
+                    crop_h= 256,
+                    crop_w= 256,
+                    crop_pos_y=0.5,
+                    crop_pos_x=0.5,
+                    mean=[123.68, 116.779, 103.939],
+                    std=[58.393, 57.12, 57.375],
+                    output_dtype=flow.float,
+                )
+
+            return normal, label
+
+        if __name__ == "__main__":
+            images, labels = coin_flip_job()
+
+    """
+    assert isinstance(name, str)
+    if seed is not None:
+        assert name is not None
+    module = flow.find_or_create_module(
+        name,
+        lambda: CoinFlipModule(
+            batch_size=batch_size, probability=probability, random_seed=seed, name=name
+        ),
+    )
+    return module()
+
+
+class CoinFlipModule(module_util.Module):
+    def __init__(
+        self, batch_size: str, probability: float, random_seed: Optional[int], name: str
+    ):
+        module_util.Module.__init__(self, name)
+        (seed, has_seed) = flow.random.gen_seed(random_seed)
+        self.op_module_builder = (
+            flow.user_op_module_builder("coin_flip")
+            .Output("out")
+            .Attr("batch_size", batch_size)
+            .Attr("probability", probability)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .CheckAndComplete()
+        )
+        self.op_module_builder.user_op_module.InitOpKernel()
+
+    def forward(self):
+        if self.call_seq_no == 0:
+            name = self.module_name
+        else:
+            name = id_util.UniqueStr("CoinFlip_")
+        return (
+            self.op_module_builder.OpName(name)
+            .Build()
+            .InferAndTryRun()
+            .SoleOutputBlob()
+        )
+
+
+def image_decode(
+    images_bytes_buffer: oneflow._oneflow_internal.BlobDesc,
+    dtype: flow.dtype = flow.uint8,
+    color_space: str = "BGR",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator decode the image.
+
+    Args:
+        images_bytes_buffer (oneflow._oneflow_internal.BlobDesc): The input Blob. Its type should be `kTensorBuffer`. More details please refer to the code example.
+        dtype (flow.dtype, optional): The data type. Defaults to flow.uint8.
+        color_space (str, optional): The color space. Defaults to "BGR".
+        name (Optional[str], optional): The name for the opreation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The decoded image list.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+        from PIL import Image
+
+
+        def _of_image_decode(images):
+            image_files = [open(im, "rb") for im in images]
+            images_bytes = [imf.read() for imf in image_files]
+            static_shape = (len(images_bytes), max([len(bys) for bys in images_bytes]))
+            for imf in image_files:
+                imf.close()
+
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def image_decode_job(
+                images_def: tp.ListListNumpy.Placeholder(shape=static_shape, dtype=flow.int8)
+            )->tp.ListListNumpy:
+                # convert to tensor buffer
+                images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+                decoded_images_buffer = flow.image_decode(images_buffer)
+                # Remember to set a shape
+                # convert back to tensor list
+                return flow.tensor_buffer_to_tensor_list(
+                    decoded_images_buffer, shape=(640, 640, 3), dtype=flow.uint8
+                )
+
+            images_np_arr = [
+                np.frombuffer(bys, dtype=np.byte).reshape(1, -1) for bys in images_bytes
+            ]
+            decoded_images = image_decode_job([images_np_arr])
+            return decoded_images[0]
+
+
+        if __name__ == "__main__":
+            img = _of_image_decode(['./img/1.jpg'])
+            print(img[0].shape) # Our image shape is (1, 349, 367, 3)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ImageDecode_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("image_decode")
+        .Input("in", [images_bytes_buffer])
+        .Output("out")
+        .Attr("color_space", color_space)
+        .Attr("data_type", dtype)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def image_batch_align(
+    images: oneflow._oneflow_internal.BlobDesc,
+    shape: Sequence[int],
+    dtype: flow.dtype,
+    alignment: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator aligns the shape for a batch of images.
+
+    The aligned shape is computed as:
+
+    .. math::
+
+        & shape_{width} = int(\\frac{(shape_{width}+alignment-1)}{alignment})*alignment
+
+        & shape_{height} = int(\\frac{(shape_{height}+alignment-1)}{alignment})*alignment
+
+    Args:
+        images (oneflow._oneflow_internal.BlobDesc): The images.
+        shape (Sequence[int]): The maximum static shape of input images.
+        dtype (flow.dtype): The data type.
+        alignment (int): The align factor.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import cv2
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def _of_image_batch_align(images, input_shape, output_shape, alignment):
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def image_batch_align_job(
+                images_def: tp.ListListNumpy.Placeholder(shape=input_shape, dtype=flow.float)
+            ) -> tp.ListNumpy:
+                # Convert to tensor buffer
+                images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+                image = flow.image_batch_align(
+                    images_buffer, shape=output_shape[1:], dtype=flow.float, alignment=alignment
+                )
+                return image
+
+            image = image_batch_align_job([images])
+            return image[0]
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return [np.expand_dims(image, axis=0) for image in images]
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            assert image_static_shape[0] == 1, str(image_static_shape)
+            image_static_shape[0] = len(image_shapes)
+            return image_static_shape
+
+        def _roundup(x, n):
+            # compute the aligned shape
+            return int((x + n - 1) / n) * n
+
+        if __name__ == "__main__":
+            img = _read_images_by_cv(['./img/1.jpg', './img/2.jpg', './img/3.jpg'])
+            img_shape = _get_images_static_shape(img) # In example is [3, 349, 367, 3]
+            alignment = 16 # alignment factor
+            aligned_image_shape = [
+                img_shape[0],
+                _roundup(img_shape[1], alignment),
+                _roundup(img_shape[2], alignment),
+                img_shape[3],
+            ]
+            image = _of_image_batch_align(img, tuple(img_shape), aligned_image_shape, alignment)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ImageBatchAlign_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("image_batch_align")
+        .Input("in", [images])
+        .Output("out")
+        .Attr("shape", shape)
+        .Attr("data_type", dtype)
+        .Attr("alignment", alignment)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def image_normalize(
+    image: oneflow._oneflow_internal.BlobDesc,
+    std: Sequence[float],
+    mean: Sequence[float],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator normalizes the image.
+
+    Args:
+        image (oneflow._oneflow_internal.BlobDesc): The input image.
+        std (Sequence[float]): The standard deviation of the images.
+        mean (Sequence[float]): The mean value of the images.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import cv2
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def _of_image_normalize(images, image_shape, std, mean):
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def image_normalize_job(
+                images_def: tp.ListListNumpy.Placeholder(shape=image_shape, dtype=flow.float)
+            ) -> tp.ListListNumpy:
+                # Convert to tensor buffer
+                images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+                # Normalize the imagess
+                norm_images = flow.image_normalize(images_buffer, std, mean)
+                # Convert back to tensor list
+                return flow.tensor_buffer_to_tensor_list(
+                    norm_images, shape=image_shape[1:], dtype=flow.float
+                )
+
+            image_tensor = image_normalize_job([images])
+            return image_tensor[0]
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return [np.expand_dims(image, axis=0) for image in images]
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            assert image_static_shape[0] == 1, str(image_static_shape)
+            image_static_shape[0] = len(image_shapes)
+            return image_static_shape
+
+        if __name__ == "__main__":
+            img = _read_images_by_cv(['./img/1.jpg', './img/2.jpg', './img/3.jpg'])
+            img_shape = _get_images_static_shape(img) # In example is [3, 349, 367, 3]
+            image = _of_image_normalize(img,
+                                        tuple(img_shape),
+                                        std=(102.9801, 115.9465, 122.7717),
+                                        mean=(1.0, 1.0, 1.0))
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("ImageNormalize_")
+    assert isinstance(std, (list, tuple))
+    assert isinstance(mean, (list, tuple))
+    op = (
+        flow.user_op_builder(name)
+        .Op("image_normalize")
+        .Input("in", [image])
+        .Output("out")
+        .Attr("std", std)
+        .Attr("mean", mean)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def image_flip(
+    image: oneflow._oneflow_internal.BlobDesc,
+    flip_code: Union[int, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator flips the images.
+
+    The flip code corresponds to the different flip mode:
+
+    0 (0x00): Non Flip
+
+    1 (0x01): Horizontal Flip
+
+    16 (0x10): Vertical Flip
+
+    17 (0x11): Both Horizontal and Vertical Flip
+
+    Args:
+        image (oneflow._oneflow_internal.BlobDesc): The input images.
+        flip_code (Union[int, oneflow._oneflow_internal.BlobDesc]): The flip code.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import cv2
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def _of_image_flip(images, image_shape, flip_code):
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def image_flip_job(
+                images_def: tp.ListListNumpy.Placeholder(shape=image_shape, dtype=flow.float)
+            ) -> tp.ListListNumpy:
+                images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+                flip_images = flow.image_flip(images_buffer, flip_code)
+                return flow.tensor_buffer_to_tensor_list(
+                    flip_images, shape=image_shape[1:], dtype=flow.float
+                )
+
+            image_tensor = image_flip_job([images])
+            return image_tensor[0]
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return [np.expand_dims(image, axis=0) for image in images]
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            assert image_static_shape[0] == 1, str(image_static_shape)
+            image_static_shape[0] = len(image_shapes)
+            return image_static_shape
+
+        if __name__ == "__main__":
+            img = _read_images_by_cv(['./img/1.jpg', './img/2.jpg', './img/3.jpg'])
+            img_shape = _get_images_static_shape(img) # In example is [3, 349, 367, 3]
+            image = _of_image_flip(img,
+                           tuple(img_shape),
+                           flip_code=1)
+
+    """
+    assert isinstance(image, oneflow._oneflow_internal.BlobDesc)
+    if name is None:
+        name = id_util.UniqueStr("ImageFlip_")
+    if not isinstance(flip_code, oneflow._oneflow_internal.BlobDesc):
+        assert isinstance(flip_code, int)
+        flip_code = flow.constant(
+            flip_code,
+            shape=(image.shape[0],),
+            dtype=flow.int8,
+            name="{}_FlipCode_".format(name),
+        )
+    else:
+        assert image.shape[0] == flip_code.shape[0]
+    op = (
+        flow.user_op_builder(name)
+        .Op("image_flip")
+        .Input("in", [image])
+        .Input("flip_code", [flip_code])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def object_bbox_flip(
+    bbox: oneflow._oneflow_internal.BlobDesc,
+    image_size: oneflow._oneflow_internal.BlobDesc,
+    flip_code: Union[int, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator flips the object bounding box.
+
+    The flip code corresponds to the different flip mode:
+
+    0 (0x00): Non Flip
+
+    1 (0x01): Horizontal Flip
+
+    16 (0x10): Vertical Flip
+
+    17 (0x11): Both Horizontal and Vertical Flip
+
+    Args:
+        bbox (oneflow._oneflow_internal.BlobDesc): The bounding box.
+        image_size (oneflow._oneflow_internal.BlobDesc): The size of input image.
+        flip_code (Union[int, oneflow._oneflow_internal.BlobDesc]): The flip code.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def _of_object_bbox_flip(bbox_list, image_size, flip_code):
+            bbox_shape = _get_bbox_static_shape(bbox_list)
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def object_bbox_flip_job(
+                bbox_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(bbox_shape), dtype=flow.float
+                ),
+                image_size_def: tp.ListNumpy.Placeholder(
+                    shape=image_size.shape, dtype=flow.int32
+                ),
+            ) -> tp.ListListNumpy:
+                bbox_buffer = flow.tensor_list_to_tensor_buffer(bbox_def)
+                flip_bbox = flow.object_bbox_flip(bbox_buffer, image_size_def, flip_code)
+                return flow.tensor_buffer_to_tensor_list(
+                    flip_bbox, shape=bbox_shape[1:], dtype=flow.float
+                )
+
+            input_bbox_list = [np.expand_dims(bbox, axis=0) for bbox in bbox_list]
+            bbox_tensor = object_bbox_flip_job([input_bbox_list], [image_size])
+            return bbox_tensor[0]
+
+
+        def _get_bbox_static_shape(bbox_list):
+            bbox_shapes = [bbox.shape for bbox in bbox_list]
+            bbox_static_shape = np.amax(bbox_shapes, axis=0)
+            assert isinstance(
+                bbox_static_shape, np.ndarray
+            ), "bbox_shapes: {}, bbox_static_shape: {}".format(
+                str(bbox_shapes), str(bbox_static_shape)
+            )
+            bbox_static_shape = bbox_static_shape.tolist()
+            bbox_static_shape.insert(0, len(bbox_list))
+            return bbox_static_shape
+
+        if __name__ == "__main__":
+            bbox = np.array([[[20.0, 40.0, 80.0, 160.0],
+                            [30.0, 50.0, 70.0, 100.0]]]).astype(np.single) # [x1, y1, x2, y2]
+            image_size = np.array([[480, 620]]).astype(np.int32)
+            bbox_flip =  _of_object_bbox_flip(bbox,
+                                            image_size,
+                                            flip_code=1) # Horizontal Flip
+            print(bbox_flip[0][0])
+
+            # [[399.  40. 459. 160.]
+            #  [409.  50. 449. 100.]]
+    """
+    assert isinstance(bbox, oneflow._oneflow_internal.BlobDesc)
+    assert isinstance(image_size, oneflow._oneflow_internal.BlobDesc)
+    assert bbox.shape[0] == image_size.shape[0]
+    if name is None:
+        name = id_util.UniqueStr("ObjectBboxFlip_")
+    if not isinstance(flip_code, oneflow._oneflow_internal.BlobDesc):
+        assert isinstance(flip_code, int)
+        flip_code = flow.constant(
+            flip_code,
+            shape=(bbox.shape[0],),
+            dtype=flow.int8,
+            name="{}_FlipCode".format(name),
+        )
+    else:
+        assert bbox.shape[0] == flip_code.shape[0]
+    op = (
+        flow.user_op_builder(name)
+        .Op("object_bbox_flip")
+        .Input("bbox", [bbox])
+        .Input("image_size", [image_size])
+        .Input("flip_code", [flip_code])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def object_bbox_scale(
+    bbox: oneflow._oneflow_internal.BlobDesc,
+    scale: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator scales the input image and the corresponding bounding box. It returns the scaled bounding box.
+
+    Args:
+        bbox (oneflow._oneflow_internal.BlobDesc): The bounding box.
+        scale (oneflow._oneflow_internal.BlobDesc): The scale factor.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import cv2
+        from typing import Tuple
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return images
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            image_static_shape.insert(0, len(image_shapes))
+            return image_static_shape
+
+
+        def _get_bbox_static_shape(bbox_list):
+            bbox_shapes = [bbox.shape for bbox in bbox_list]
+            bbox_static_shape = np.amax(bbox_shapes, axis=0)
+            assert isinstance(
+                bbox_static_shape, np.ndarray
+            ), "bbox_shapes: {}, bbox_static_shape: {}".format(
+                str(bbox_shapes), str(bbox_static_shape)
+            )
+            bbox_static_shape = bbox_static_shape.tolist()
+            bbox_static_shape.insert(0, len(bbox_list))
+            return bbox_static_shape
+
+
+        def _of_target_resize_bbox_scale(images, bbox_list, target_size, max_size):
+            image_shape = _get_images_static_shape(images)
+            bbox_shape = _get_bbox_static_shape(bbox_list)
+
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def target_resize_bbox_scale_job(
+                image_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(image_shape), dtype=flow.float
+                ),
+                bbox_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(bbox_shape), dtype=flow.float
+                ),
+            ) -> Tuple[tp.ListListNumpy, tp.ListNumpy]:
+                images_buffer = flow.tensor_list_to_tensor_buffer(image_def)
+                resized_images_buffer, new_size, scale = flow.image_target_resize(
+                    images_buffer, target_size=target_size, max_size=max_size
+                )
+                bbox_buffer = flow.tensor_list_to_tensor_buffer(bbox_def)
+                scaled_bbox = flow.object_bbox_scale(bbox_buffer, scale)
+                scaled_bbox_list = flow.tensor_buffer_to_tensor_list(
+                    scaled_bbox, shape=bbox_shape[1:], dtype=flow.float
+                )
+                return scaled_bbox_list, new_size
+
+            input_image_list = [np.expand_dims(image, axis=0) for image in images]
+            input_bbox_list = [np.expand_dims(bbox, axis=0) for bbox in bbox_list]
+            output_bbox_list, output_image_size = target_resize_bbox_scale_job(
+                [input_image_list], [input_bbox_list]
+            )
+            return output_bbox_list[0], output_image_size[0]
+
+
+        if __name__ == "__main__":
+            images = _read_images_by_cv(['./img/1.jpg', './img/2.jpg'])
+            bbox = np.array([[[20.0, 40.0, 80.0, 160.0],
+                            [30.0, 50.0, 70.0, 100.0]],
+                            [[26.0, 40.0, 86.0, 160.0],
+                            [36.0, 56.0, 76.0, 106.0]]]).astype(np.single) # [x1, y1, x2, y2]
+            bbox, size = _of_target_resize_bbox_scale(images, bbox, 280, 350)
+            print(bbox[0])
+            print(bbox[1])
+
+            # [[[ 16.0218    32.09169   64.0872   128.36676 ]
+            #   [ 24.032698  40.114613  56.076298  80.229225]]]
+
+            # [[[ 24.186047  37.170418  80.       148.68167 ]
+            #   [ 33.488373  52.038586  70.69768   98.5016  ]]]
+
+    """
+    assert isinstance(bbox, oneflow._oneflow_internal.BlobDesc)
+    assert isinstance(scale, oneflow._oneflow_internal.BlobDesc)
+    assert bbox.shape[0] == scale.shape[0]
+    if name is None:
+        name = id_util.UniqueStr("ObjectBboxScale_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("object_bbox_scale")
+        .Input("bbox", [bbox])
+        .Input("scale", [scale])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def object_segm_poly_flip(
+    poly: oneflow._oneflow_internal.BlobDesc,
+    image_size: oneflow._oneflow_internal.BlobDesc,
+    flip_code: Union[int, oneflow._oneflow_internal.BlobDesc],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator flips the segmentation points in image.
+
+    The flip code corresponds to the different flip mode:
+
+    0 (0x00): Non Flip
+
+    1 (0x01): Horizontal Flip
+
+    16 (0x10): Vertical Flip
+
+    17 (0x11): Both Horizontal and Vertical Flip
+
+    Args:
+        poly (oneflow._oneflow_internal.BlobDesc): The poly segmentation points.
+        image_size (oneflow._oneflow_internal.BlobDesc): The image size.
+        flip_code (Union[int, oneflow._oneflow_internal.BlobDesc]): The filp code.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import cv2
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return [np.expand_dims(image, axis=0) for image in images]
+
+
+        def _of_object_segm_poly_flip(poly_list, image_size, flip_code):
+            poly_shape = _get_segm_poly_static_shape(poly_list)
+
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def object_segm_poly_flip_job(
+                poly_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(poly_shape), dtype=flow.float
+                ),
+                image_size_def: tp.ListNumpy.Placeholder(
+                    shape=image_size.shape, dtype=flow.int32
+                ),
+            ) -> tp.ListListNumpy:
+                poly_buffer = flow.tensor_list_to_tensor_buffer(poly_def)
+                flip_poly = flow.object_segmentation_polygon_flip(
+                    poly_buffer, image_size_def, flip_code
+                )
+                return flow.tensor_buffer_to_tensor_list(
+                    flip_poly, shape=poly_shape[1:], dtype=flow.float
+                )
+
+            input_poly_list = [np.expand_dims(poly, axis=0) for poly in poly_list]
+            poly_tensor = object_segm_poly_flip_job([input_poly_list], [image_size])
+            return poly_tensor[0]
+
+
+        def _get_segm_poly_static_shape(poly_list):
+            poly_shapes = [poly.shape for poly in poly_list]
+            poly_static_shape = np.amax(poly_shapes, axis=0)
+            assert isinstance(
+                poly_static_shape, np.ndarray
+            ), "poly_shapes: {}, poly_static_shape: {}".format(
+                str(poly_shapes), str(poly_static_shape)
+            )
+            poly_static_shape = poly_static_shape.tolist()
+            poly_static_shape.insert(0, len(poly_list))
+            return poly_static_shape
+
+        if __name__ == "__main__":
+            segm_poly_list = []
+            segmentations = [[[20.0, 40.0], [80.0, 160.0], [100.0, 210.0]], # Image 1 segmentation point
+                            [[25.0, 45.0], [85.0, 165.0], [105.0, 215.0]]] # Image 2 segmentation point
+            for segmentation in segmentations:
+                polygon = []
+                for seg in segmentation:
+                    polygon.extend(seg)
+                poly_array = np.array(polygon, dtype=np.single).reshape(-1, 2) # Reshape it
+                segm_poly_list.append(poly_array)
+
+            image_size = np.array([[480, 620], # Image 1 size
+                                [640, 640]]).astype(np.int32) # Image 2 size
+            of_segm_poly_list = _of_object_segm_poly_flip(
+                segm_poly_list, image_size, flip_code=1
+            ) # Horizontal Flip
+            print(of_segm_poly_list[0])
+            print(of_segm_poly_list[1])
+
+            # of_segm_poly_list[0]
+            # [[[460.  40.]
+            #   [400. 160.]
+            #   [380. 210.]]]
+
+            # of_segm_poly_list[1]
+            # [[[615.  45.]
+            #   [555. 165.]
+            #   [535. 215.]]]
+
+    """
+    assert isinstance(poly, oneflow._oneflow_internal.BlobDesc)
+    assert isinstance(image_size, oneflow._oneflow_internal.BlobDesc)
+    assert poly.shape[0] == image_size.shape[0]
+    if name is None:
+        name = id_util.UniqueStr("ObjectSegmPolyFilp_")
+    if not isinstance(flip_code, oneflow._oneflow_internal.BlobDesc):
+        assert isinstance(flip_code, int)
+        flip_code = flow.constant(
+            flip_code,
+            shape=(poly.shape[0],),
+            dtype=flow.int8,
+            name="{}_FlipCode".format(name),
+        )
+    else:
+        assert poly.shape[0] == flip_code.shape[0]
+    op = (
+        flow.user_op_builder(name)
+        .Op("object_segmentation_polygon_flip")
+        .Input("poly", [poly])
+        .Input("image_size", [image_size])
+        .Input("flip_code", [flip_code])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def object_segm_poly_scale(
+    poly: oneflow._oneflow_internal.BlobDesc,
+    scale: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator scales the segmentation points in the images.
+
+    Args:
+        poly (oneflow._oneflow_internal.BlobDesc): The poly segmentation points.
+        scale (oneflow._oneflow_internal.BlobDesc): The image scale.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import cv2
+        from typing import Tuple
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return images
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            image_static_shape.insert(0, len(image_shapes))
+            return image_static_shape
+
+
+        def _get_segm_poly_static_shape(poly_list):
+            poly_shapes = [poly.shape for poly in poly_list]
+            poly_static_shape = np.amax(poly_shapes, axis=0)
+            assert isinstance(
+                poly_static_shape, np.ndarray
+            ), "poly_shapes: {}, poly_static_shape: {}".format(
+                str(poly_shapes), str(poly_static_shape)
+            )
+            poly_static_shape = poly_static_shape.tolist()
+            poly_static_shape.insert(0, len(poly_list))
+            return poly_static_shape
+
+
+        def _get_bbox_static_shape(bbox_list):
+            bbox_shapes = [bbox.shape for bbox in bbox_list]
+            bbox_static_shape = np.amax(bbox_shapes, axis=0)
+            assert isinstance(
+                bbox_static_shape, np.ndarray
+            ), "bbox_shapes: {}, bbox_static_shape: {}".format(
+                str(bbox_shapes), str(bbox_static_shape)
+            )
+            bbox_static_shape = bbox_static_shape.tolist()
+            bbox_static_shape.insert(0, len(bbox_list))
+            return bbox_static_shape
+
+
+        def _of_object_segm_poly_scale(images, poly_list, target_size, max_size):
+            image_shape = _get_images_static_shape(images)
+            print(image_shape)
+            poly_shape = _get_segm_poly_static_shape(poly_list)
+            print("Poly shape is ", poly_shape)
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.default_logical_view(flow.scope.mirrored_view())
+
+            @flow.global_function(function_config=func_config)
+            def object_segm_poly_scale_job(
+                image_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(image_shape), dtype=flow.float
+                ),
+                poly_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(poly_shape), dtype=flow.float
+                ),
+            ) -> Tuple[tp.ListListNumpy, tp.ListNumpy]:
+                images_buffer = flow.tensor_list_to_tensor_buffer(image_def)
+                resized_images_buffer, new_size, scale = flow.image_target_resize(
+                    images_buffer, target_size=target_size, max_size=max_size
+                )
+                poly_buffer = flow.tensor_list_to_tensor_buffer(poly_def)
+                scaled_poly = flow.object_segmentation_polygon_scale(poly_buffer, scale)
+                scaled_poly_list = flow.tensor_buffer_to_tensor_list(
+                    scaled_poly, shape=poly_shape[1:], dtype=flow.float
+                )
+                return scaled_poly_list, new_size
+
+            input_image_list = [np.expand_dims(image, axis=0) for image in images]
+            input_poly_list = [np.expand_dims(poly, axis=0) for poly in poly_list]
+
+            output_poly_list, output_image_size = object_segm_poly_scale_job(
+                [input_image_list], [input_poly_list]
+            )
+
+            return output_poly_list[0], output_image_size
+
+        if __name__ == "__main__":
+            images = _read_images_by_cv(['./img/1.jpg', './img/2.jpg'])
+            segm_poly_list = []
+            segmentations = [[[20.0, 40.0], [80.0, 160.0], [100.0, 210.0]], # Image 1 segmentation point
+                            [[25.0, 45.0], [85.0, 165.0], [105.0, 215.0]]] # Image 2 segmentation point
+
+            for segmentation in segmentations:
+                polygon = []
+                for seg in segmentation:
+                    polygon.extend(seg)
+                poly_array = np.array(polygon, dtype=np.single).reshape(-1, 2) # Reshape it
+                segm_poly_list.append(poly_array)
+
+            bbox, size = _of_object_segm_poly_scale(images, segm_poly_list, 280, 350)
+
+    """
+    assert isinstance(poly, oneflow._oneflow_internal.BlobDesc)
+    assert isinstance(scale, oneflow._oneflow_internal.BlobDesc)
+    assert poly.shape[0] == scale.shape[0]
+    if name is None:
+        name = id_util.UniqueStr("ObjectSegmPolyFilp_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("object_segmentation_polygon_scale")
+        .Input("poly", [poly])
+        .Input("scale", [scale])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def object_segm_poly_to_mask(
+    poly: oneflow._oneflow_internal.BlobDesc,
+    poly_index: oneflow._oneflow_internal.BlobDesc,
+    image_size: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator converts the poly segment points to the segment mask array.
+
+    Args:
+        poly (oneflow._oneflow_internal.BlobDesc): The poly segment points.
+        poly_index (oneflow._oneflow_internal.BlobDesc): The poly segment index.
+        image_size (oneflow._oneflow_internal.BlobDesc): The input image size.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import cv2
+        from typing import Tuple
+
+
+        def _read_images_by_cv(image_files):
+            images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+            return images
+
+
+        def _get_images_static_shape(images):
+            image_shapes = [image.shape for image in images]
+            image_static_shape = np.amax(image_shapes, axis=0)
+            assert isinstance(
+                image_static_shape, np.ndarray
+            ), "image_shapes: {}, image_static_shape: {}".format(
+                str(image_shapes), str(image_static_shape)
+            )
+            image_static_shape = image_static_shape.tolist()
+            image_static_shape.insert(0, len(image_shapes))
+            return image_static_shape
+
+
+        def _get_segm_poly_static_shape(poly_list, poly_index_list):
+            assert len(poly_list) == len(poly_index_list)
+            num_images = len(poly_list)
+            max_poly_elems = 0
+            for poly, poly_index in zip(poly_list, poly_index_list):
+                assert len(poly.shape) == 2
+                assert len(poly_index.shape) == 2, str(poly_index.shape)
+                assert poly.shape[0] == poly_index.shape[0]
+                assert poly.shape[1] == 2
+                assert poly_index.shape[1] == 3
+                max_poly_elems = max(max_poly_elems, poly.shape[0])
+            return [num_images, max_poly_elems, 2], [num_images, max_poly_elems, 3]
+
+        def _segm_poly_to_tensor(img_segm_poly_list):
+            poly_array_list = []
+            poly_index_array_list = []
+            for img_idx, segm_poly_list in enumerate(img_segm_poly_list):
+                img_poly_elem_list = []
+                img_poly_index_list = []
+
+                for obj_idx, poly_list in enumerate(segm_poly_list):
+                    for poly_idx, poly in enumerate(poly_list):
+                        img_poly_elem_list.extend(poly)
+                        for pt_idx, pt in enumerate(poly):
+                            if pt_idx % 2 == 0:
+                                img_poly_index_list.append([pt_idx / 2, poly_idx, obj_idx])
+
+                img_poly_array = np.array(img_poly_elem_list, dtype=np.single).reshape(-1, 2)
+                assert img_poly_array.size > 0, segm_poly_list
+                poly_array_list.append(img_poly_array)
+
+                img_poly_index_array = np.array(img_poly_index_list, dtype=np.int32)
+                assert img_poly_index_array.size > 0, segm_poly_list
+                poly_index_array_list.append(img_poly_index_array)
+
+            return poly_array_list, poly_index_array_list
+
+
+        def _of_poly_to_mask_pipline(
+            images, poly_list, poly_index_list, num_segms_list, target_size, max_size
+        ):
+            print(len(images))
+            print(len(poly_list))
+
+            assert len(images) == len(poly_list)
+            assert len(poly_list) == len(poly_index_list)
+            image_shape = _get_images_static_shape(images)
+            poly_shape, poly_index_shape = _get_segm_poly_static_shape(
+                poly_list, poly_index_list
+            )
+            max_num_segms = max(num_segms_list)
+
+            func_config = flow.FunctionConfig()
+            func_config.default_logical_view(flow.scope.mirrored_view())
+            func_config.default_data_type(flow.float)
+
+
+            @flow.global_function(function_config=func_config)
+            def poly_to_mask_job(
+                image_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(image_shape), dtype=flow.float
+                ),
+                poly_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(poly_shape), dtype=flow.float
+                ),
+                poly_index_def: tp.ListListNumpy.Placeholder(
+                    shape=tuple(poly_index_shape), dtype=flow.int32
+                ),
+            ) -> Tuple[tp.ListListNumpy, tp.ListListNumpy]:
+                images_buffer = flow.tensor_list_to_tensor_buffer(image_def)
+                resized_images_buffer, new_size, scale = flow.image_target_resize(
+                    images_buffer, target_size=target_size, max_size=max_size
+                )
+                poly_buffer = flow.tensor_list_to_tensor_buffer(poly_def)
+                poly_index_buffer = flow.tensor_list_to_tensor_buffer(poly_index_def)
+                scaled_poly_buffer = flow.object_segmentation_polygon_scale(poly_buffer, scale)
+                mask_buffer = flow.object_segmentation_polygon_to_mask(
+                    scaled_poly_buffer, poly_index_buffer, new_size
+                )
+                mask_list = flow.tensor_buffer_to_tensor_list(
+                    mask_buffer, shape=(max_num_segms, target_size, max_size), dtype=flow.int8
+                )
+                scaled_poly_list = flow.tensor_buffer_to_tensor_list(
+                    scaled_poly_buffer, shape=poly_shape[1:], dtype=flow.float
+                )
+                return mask_list, scaled_poly_list
+
+            input_image_list = [np.expand_dims(image, axis=0) for image in images]
+            input_poly_list = [np.expand_dims(poly, axis=0) for poly in poly_list]
+            input_poly_index_list = [
+                np.expand_dims(poly_index, axis=0) for poly_index in poly_index_list
+            ]
+
+            output_mask_list, output_poly_list = poly_to_mask_job(
+                [input_image_list], [input_poly_list], [input_poly_index_list]
+            )
+
+            return output_mask_list[0], output_poly_list[0]
+
+        if __name__ == "__main__":
+            images = _read_images_by_cv(['./img/1.jpg', './img/2.jpg'])
+            segm_poly_list = []
+
+            segmentations = [[[20.0, 40.0, 80.0, 160.0, 100.0, 210.0, 120.0, 215.0]], # Image 1 segmentation point
+                            [[24.0, 42.0, 86.0, 168.0, 103.0, 223.0, 125.0, 235.0]]] # Image 2 segmentation point
+
+            for segmentation in segmentations:
+                polygon = []
+                for seg in segmentation:
+                    polygon.extend(seg)
+
+                poly_array = np.array(polygon, dtype=np.single).reshape(-1, 2) # Reshape it
+                segm_poly_list.append([poly_array])
+
+            poly_list, poly_index_list = _segm_poly_to_tensor(segm_poly_list)
+            num_segms_list = [len(segm_poly_list) for segm_poly_list in segm_poly_list]
+            target_size = 280
+            max_size = 350
+            of_mask_list, of_scaled_poly_list = _of_poly_to_mask_pipline(
+                images, poly_list, poly_index_list, num_segms_list, target_size, max_size
+            )
+            of_mask_list = [
+                mask_array.reshape(-1, mask_array.shape[-2], mask_array.shape[-1])
+                for mask_array in of_mask_list
+            ] # reshape it
+
+    """
+    assert isinstance(poly, oneflow._oneflow_internal.BlobDesc)
+    assert isinstance(poly_index, oneflow._oneflow_internal.BlobDesc)
+    assert isinstance(image_size, oneflow._oneflow_internal.BlobDesc)
+    assert poly.shape[0] == poly_index.shape[0]
+    assert poly.shape[0] == image_size.shape[0]
+    if name is None:
+        name = id_util.UniqueStr("ObjectSegmPolyToMask_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("object_segmentation_polygon_to_mask")
+        .Input("poly", [poly])
+        .Input("poly_index", [poly_index])
+        .Input("image_size", [image_size])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def api_coco_reader(
+    annotation_file: str,
+    image_dir: str,
+    batch_size: int,
+    shuffle: bool = True,
+    random_seed: Optional[int] = None,
+    group_by_aspect_ratio: bool = True,
+    stride_partition: bool = True,
+    remove_images_without_annotations: bool = True,
+    name: str = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    assert name is not None
+    module = flow.find_or_create_module(
+        name,
+        lambda: COCOReader(
+            annotation_file=annotation_file,
+            image_dir=image_dir,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            random_seed=random_seed,
+            group_by_aspect_ratio=group_by_aspect_ratio,
+            remove_images_without_annotations=remove_images_without_annotations,
+            stride_partition=stride_partition,
+            name=name,
+        ),
+    )
+    return module()
+
+
+class COCOReader(module_util.Module):
+    def __init__(
+        self,
+        annotation_file: str,
+        image_dir: str,
+        batch_size: int,
+        shuffle: bool = True,
+        random_seed: Optional[int] = None,
+        group_by_aspect_ratio: bool = True,
+        remove_images_without_annotations: bool = True,
+        stride_partition: bool = True,
+        name: str = None,
+    ):
+        assert name is not None
+        if random_seed is None:
+            random_seed = random.randrange(sys.maxsize)
+        module_util.Module.__init__(self, name)
+        self.op_module_builder = (
+            flow.consistent_user_op_module_builder("COCOReader")
+            .Output("image")
+            .Output("image_id")
+            .Output("image_size")
+            .Output("gt_bbox")
+            .Output("gt_label")
+            .Output("gt_segm")
+            .Output("gt_segm_index")
+            .Attr("session_id", flow.current_scope().session_id)
+            .Attr("annotation_file", annotation_file)
+            .Attr("image_dir", image_dir)
+            .Attr("batch_size", batch_size)
+            .Attr("shuffle_after_epoch", shuffle)
+            .Attr("random_seed", random_seed)
+            .Attr("group_by_ratio", group_by_aspect_ratio)
+            .Attr(
+                "remove_images_without_annotations", remove_images_without_annotations
+            )
+            .Attr("stride_partition", stride_partition)
+            .CheckAndComplete()
+        )
+        self.op_module_builder.user_op_module.InitOpKernel()
+
+    def forward(self):
+        if self.call_seq_no == 0:
+            name = self.module_name
+        else:
+            name = id_util.UniqueStr("COCOReader")
+        return (
+            self.op_module_builder.OpName(name)
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()
+        )
+
+
+def ofrecord_image_classification_reader(
+    ofrecord_dir: str,
+    image_feature_name: str,
+    label_feature_name: str,
+    batch_size: int = 1,
+    data_part_num: int = 1,
+    part_name_prefix: str = "part-",
+    part_name_suffix_length: int = -1,
+    random_shuffle: bool = False,
+    shuffle_buffer_size: int = 1024,
+    shuffle_after_epoch: bool = False,
+    color_space: str = "BGR",
+    decode_buffer_size_per_thread: int = 32,
+    num_decode_threads_per_machine: Optional[int] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a reader for image classification tasks.
+
+    Args:
+        ofrecord_dir (str): The directory of ofrecord file.
+        image_feature_name (str): The name of the image feature.
+        label_feature_name (str): The name of the label feature.
+        batch_size (int, optional): The batch_size. Defaults to 1.
+        data_part_num (int, optional): The amounts of data part. Defaults to 1.
+        part_name_prefix (str, optional): The prefix of data part name. Defaults to "part-".
+        part_name_suffix_length (int, optional): The suffix name of data part name. Defaults to -1.
+        random_shuffle (bool, optional): Whether to random shuffle the data. Defaults to False.
+        shuffle_buffer_size (int, optional): The buffer size for shuffle data. Defaults to 1024.
+        shuffle_after_epoch (bool, optional): Whether to shuffle the data after each epoch. Defaults to False.
+        color_space (str, optional): The color space. Defaults to "BGR".
+        decode_buffer_size_per_thread (int, optional): The decode buffer size for per thread. Defaults to 32.
+        num_decode_threads_per_machine (Optional[int], optional): The amounts of decode threads for each machine. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        from typing import Tuple
+
+
+        @flow.global_function(type="predict")
+        def image_classifier_job() -> Tuple[tp.Numpy, tp.Numpy]:
+            image, label = flow.data.ofrecord_image_classification_reader(
+                ofrecord_dir="./imgdataset",
+                image_feature_name="encoded",
+                label_feature_name="class/label",
+                batch_size=8,
+                data_part_num=1,
+                part_name_prefix="part-",
+                part_name_suffix_length=-1,
+                random_shuffle=False,
+                shuffle_after_epoch=False,
+                color_space="RGB",
+                decode_buffer_size_per_thread=16,
+            )
+            res_image, scale, new_size = flow.image.Resize(
+                    image, target_size=(224, 224)
+                )
+            return res_image, label
+
+
+        if __name__ == "__main__":
+            images, labels = image_classifier_job()
+            # images.shape (8, 224, 224, 3)
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("OFRecordImageClassificationReader_")
+    (image, label) = (
+        flow.user_op_builder(name)
+        .Op("ofrecord_image_classification_reader")
+        .Output("image")
+        .Output("label")
+        .Attr("data_dir", ofrecord_dir)
+        .Attr("data_part_num", data_part_num)
+        .Attr("batch_size", batch_size)
+        .Attr("part_name_prefix", part_name_prefix)
+        .Attr("random_shuffle", random_shuffle)
+        .Attr("shuffle_buffer_size", shuffle_buffer_size)
+        .Attr("shuffle_after_epoch", shuffle_after_epoch)
+        .Attr("part_name_suffix_length", part_name_suffix_length)
+        .Attr("color_space", color_space)
+        .Attr("image_feature_name", image_feature_name)
+        .Attr("label_feature_name", label_feature_name)
+        .Attr("decode_buffer_size_per_thread", decode_buffer_size_per_thread)
+        .Attr("num_decode_threads_per_machine", num_decode_threads_per_machine or 0)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    label = flow.tensor_buffer_to_tensor(label, dtype=flow.int32, instance_shape=[1])
+    label = flow.squeeze(label, axis=[-1])
+    return (image, label)
+
+
+def OneRecDecoder(
+    input_blob,
+    key,
+    dtype,
+    shape,
+    is_dynamic=False,
+    reshape=None,
+    batch_padding=None,
+    name=None,
+):
+    if name is None:
+        name = id_util.UniqueStr("OneRecDecoder_")
+    if reshape is not None:
+        has_reshape = True
+    else:
+        has_reshape = False
+        reshape = shape
+    if batch_padding is not None:
+        has_batch_padding = True
+    else:
+        has_batch_padding = False
+        batch_padding = shape
+    return (
+        flow.user_op_builder(name)
+        .Op("onerec_decoder")
+        .Input("in", [input_blob])
+        .Output("out")
+        .Attr("key", key)
+        .Attr("data_type", dtype)
+        .Attr("static_shape", shape)
+        .Attr("is_dynamic", is_dynamic)
+        .Attr("has_reshape", has_reshape)
+        .Attr("reshape", reshape)
+        .Attr("has_batch_padding", has_batch_padding)
+        .Attr("batch_padding", batch_padding)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def gpt_data_loader(
+    data_file_prefix: str,
+    seq_length: int,
+    num_samples: int,
+    batch_size: int,
+    dtype: flow.dtype = flow.int64,
+    shuffle: bool = True,
+    random_seed: Optional[int] = None,
+    split_sizes: Optional[Sequence[str]] = None,
+    split_index: Optional[int] = None,
+    parallel_distribution: Optional[Sequence[str]] = None,
+    start_from_saved_progress: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = (
+            "gpt_data_loader"
+            if start_from_saved_progress
+            else id_util.UniqueStr("gpt_data_loader_")
+        )
+    label_length = 1
+    if parallel_distribution is None:
+        parallel_distribution = []
+    if split_index is None:
+        split_index = 0
+    if split_sizes is None:
+        split_sizes = (1,)
+    if split_index >= len(split_sizes):
+        raise ValueError(
+            "split index {} is out of range, split_sizes {}".formart(
+                split_index, split_sizes
+            )
+        )
+    if random_seed is None:
+        from datetime import datetime
+
+        random_seed = int(datetime.utcnow().timestamp())
+
+    def distribute_to_str(dist):
+        if dist is None:
+            return ""
+        elif type(dist) is str:
+            return dist
+        elif type(dist) is oneflow._oneflow_internal.distribute.SplitDistribute:
+            return "S({})".format(dist.axis)
+        elif type(dist) is oneflow._oneflow_internal.distribute.BroadcastDistribute:
+            return "B"
+        else:
+            raise ValueError("unsupported distribute")
+
+    parallel_distribution = list(map(distribute_to_str, parallel_distribution))
+    if start_from_saved_progress:
+        iteration_name = "{}-iteration-sq{}-sa{}-bs{}-sd{}-sp{}-spi{}-{}".format(
+            name,
+            seq_length,
+            num_samples,
+            batch_size,
+            random_seed,
+            "_".join([str(s) for s in split_sizes]),
+            split_index,
+            "_".join(
+                [
+                    "S{}".format(p[2:-1]) if p.startswith("S") else p
+                    for p in parallel_distribution
+                ]
+            ),
+        )
+        iteration = flow.get_variable(
+            name=iteration_name,
+            shape=(1,),
+            dtype=flow.int64,
+            initializer=flow.constant_initializer(0, flow.int64),
+            model_name="iteration",
+            reuse=False,
+        )
+    op_builder = flow.user_op_builder(name).Op("megatron_gpt_mmap_data_loader")
+    if start_from_saved_progress:
+        op_builder.Input("iteration", [iteration])
+    op = (
+        op_builder.Output("out")
+        .Attr("data_file_prefix", data_file_prefix)
+        .Attr("seq_length", seq_length)
+        .Attr("label_length", label_length)
+        .Attr("num_samples", num_samples)
+        .Attr("batch_size", batch_size)
+        .Attr("dtype", dtype)
+        .Attr("shuffle", shuffle)
+        .Attr("random_seed", random_seed)
+        .Attr("split_sizes", split_sizes)
+        .Attr("split_index", split_index)
+        .Attr("parallel_distribution", parallel_distribution)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
diff --git a/python/oneflow/compatible/single_client/ops/user_op_builder.py b/python/oneflow/compatible/single_client/ops/user_op_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..55430ca07630752e77966dd9705fba21470dac60
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/user_op_builder.py
@@ -0,0 +1,551 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import traceback
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.common import data_type as data_type_cfg
+from oneflow._oneflow_internal.oneflow.core.common import shape as shape_cfg
+from oneflow._oneflow_internal.oneflow.core.framework import (
+    user_op_attr as user_op_attr_cfg,
+)
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.eager import eager_blob_util as eager_blob_util
+from oneflow.compatible.single_client.eager import gradient_util as gradient_util
+from oneflow.compatible.single_client.experimental import namescope as name_scope
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import (
+    compile_context as compile_context,
+)
+from oneflow.compatible.single_client.framework import distribute as distribute
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import interpret_util as interpret_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.eager import eager_symbol_pb2 as eager_symbol_util
+from oneflow.core.framework import user_op_attr_pb2 as attr_value_pb
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_util
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+class UserOp(object):
+    def __init__(self, op_name, op_type_name=None):
+        self.op_conf_ = op_conf_util.OperatorConf()
+        self.op_conf_.name = op_name
+        if op_type_name is not None:
+            self.op_conf_.user_conf.op_type_name = op_type_name
+        device_tag = flow.current_scope().device_parallel_desc_symbol.device_tag
+        self.op_conf_.device_tag = device_tag
+        self.output_arg_key_list_ = []
+
+    @property
+    def op_conf(self):
+        return self.op_conf_
+
+    def InferAndTryRun(self):
+        raise NotImplementedError
+
+    def MakeRemoteBlob(self, lbi):
+        raise NotImplementedError
+
+    def RemoteBlobList(self):
+        remote_blob_list = []
+        for k in self.op_conf_.user_conf.output:
+            if k not in self.output_arg_key_list_:
+                raise ValueError(
+                    "output_arg_name {} of {} op is not set in python op builder".format(
+                        k, self.op_conf_.name
+                    )
+                )
+        for output_arg_name in self.output_arg_key_list_:
+            assert output_arg_name in self.op_conf_.user_conf.output
+            for i in range(len(self.op_conf_.user_conf.output[output_arg_name].s)):
+                lbi = logical_blob_id_util.LogicalBlobId()
+                lbi.op_name = self.op_conf_.name
+                lbi.blob_name = "{}_{}".format(output_arg_name, i)
+                remote_blob_obj = self.MakeRemoteBlob(lbi)
+                remote_blob_list.append(remote_blob_obj)
+                if flow.eager_execution_enabled():
+                    gradient_util.GetDefaultBackwardBlobRegister().TrySetObject4BlobName(
+                        remote_blob_obj.logical_blob_name, remote_blob_obj.blob_object
+                    )
+        return tuple(remote_blob_list)
+
+    def RemoteBlobDict(self):
+        remote_blob_dict = {}
+        for k in self.op_conf_.user_conf.output:
+            if k not in self.output_arg_key_list_:
+                raise ValueError(
+                    "output_arg_name {} of {} op is not set in python op builder".format(
+                        k, self.op_conf_.name
+                    )
+                )
+        for output_arg_name in self.output_arg_key_list_:
+            assert output_arg_name in self.op_conf_.user_conf.output
+            if output_arg_name not in remote_blob_dict:
+                remote_blob_dict[output_arg_name] = []
+            for i in range(len(self.op_conf_.user_conf.output[output_arg_name].s)):
+                lbi = logical_blob_id_util.LogicalBlobId()
+                lbi.op_name = self.op_conf_.name
+                lbi.blob_name = "{}_{}".format(output_arg_name, i)
+                remote_blob_dict[output_arg_name].append(self.MakeRemoteBlob(lbi))
+        return remote_blob_dict
+
+    def SoleOutputBlob(self):
+        blobs = self.RemoteBlobList()
+        assert len(blobs) == 1
+        return blobs[0]
+
+
+class UserOpModule(object):
+    @property
+    def opkernel_object(self):
+        return self.opkernel_object_
+
+    def set_opkernel_object(self, opkernel_object):
+        assert not hasattr(self, "opkernel_object_")
+        self.opkernel_object_ = opkernel_object
+
+    def InitOpKernel(self):
+        raise NotImplementedError
+
+
+def api_user_op_builder(op_name):
+    """Build a wrapper of user op.
+
+    For instance::
+        def myargmax(
+            input: oneflow._oneflow_internal.BlobDesc) -> oneflow._oneflow_internal.BlobDesc:
+            return (
+            flow.user_op_builder("myargmax")
+            .Op("argmax")
+            .Input("in", [input])
+            .Output("out")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+            )
+
+    Args:
+        op_name (str): name of new user op
+
+    Returns:
+        UserOpConfBuilder: `UserOpConfBuilder` object used to build a wrapper of user op.
+    """
+    api = enable_if.unique([lazy_user_op_builder, eager_user_op_builder])
+    return api(op_name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_user_op_builder(op_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+    return UserOpConfBuilder(LazyUserOp, op_name, None)
+
+
+class LazyUserOp(UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InferAndTryRun(self):
+        compile_context.CurJobAddOp(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_user_op_builder(op_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+    return UserOpConfBuilder(EagerUserOp, op_name, None)
+
+
+class EagerUserOp(UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InferAndTryRun(self):
+        interpret_util.Forward(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.EagerLogicalBlob(lbi)
+
+
+def api_consistent_user_op_builder(op_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+    return UserOpConfBuilder(ConsistentUserOp, op_name, None)
+
+
+class ConsistentUserOp(UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InferAndTryRun(self):
+        interpret_util.ConsistentForward(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+class UserOpConfBuilder(object):
+    def __init__(self, user_op_or_module_class, op_name, op_type_name):
+        self.user_op_ = user_op_or_module_class(op_name, op_type_name)
+
+    def CheckAndComplete(self):
+        assert self.user_op_.op_conf_.user_conf.op_type_name != ""
+        self.user_op_.op_conf_ = c_api_util.CheckAndCompleteUserOpConf(
+            self.user_op_.op_conf_
+        )
+        return self
+
+    def Build(self):
+        """Build op when in/output and other attribute set up.
+
+        Returns:
+            self
+
+        """
+        return self.CheckAndComplete().user_op_
+
+    def OpName(self, op_name):
+        job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+        op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+        self.user_op_.op_conf_.name = op_name
+        user_conf = self.user_op_.op_conf_.user_conf
+
+        def GetLbn(output_name, i):
+            return "{}/{}_{}".format(op_name, output_name, i)
+
+        for (output_name, output) in user_conf.output.items():
+            output.s[:] = [GetLbn(output_name, i) for i in range(len(output.s))]
+        return self
+
+    def Op(self, op_type_name):
+        """set typename of op
+
+        Args:
+            op_type_name (string): op type name
+
+        Returns:
+            self
+        """
+        self.user_op_.op_conf_.user_conf.op_type_name = op_type_name
+        return self
+
+    def Input(self, input_name, input_blob_list):
+        """Set input blob of op
+
+        Args:
+            input_name (str): input name of blob
+            input_blob_list : list of blobs
+
+        Returns:
+            self
+        """
+        assert isinstance(input_blob_list, (tuple, list))
+        input_conf = self.user_op_.op_conf_.user_conf.input
+        input_conf[input_name].ClearField("s")
+        for input_blob in input_blob_list:
+            input_conf[input_name].s.append(input_blob.unique_name)
+        return self
+
+    def InputSize(self, input_name, input_blob_size):
+        input_conf = self.user_op_.op_conf_.user_conf.input
+        assert input_blob_size >= 0
+        assert input_name not in input_conf
+        for i in range(input_blob_size):
+            unique_name = "%s/%s_%s" % (self.user_op_.op_conf_.name, input_name, i)
+            input_conf[input_name].s.append(unique_name)
+        return self
+
+    def Output(self, output_name, num=1):
+        """Set output blob of op
+
+        Args:
+            output_name (str): name of output blob
+            num (int, optional):  Defaults to 1.
+
+        Returns:
+            self
+        """
+        assert isinstance(num, int) and num >= 1
+        out_lbns = []
+        for i in range(num):
+            lbn = "{}/{}_{}".format(self.user_op_.op_conf_.name, output_name, i)
+            out_lbns.append(lbn)
+        self.user_op_.op_conf_.user_conf.output[output_name].s[:] = out_lbns
+        self.user_op_.output_arg_key_list_.append(output_name)
+        return self
+
+    def Attr(self, attr_name, attr_value, attr_type_name=None):
+        """Set value of op's attribute.
+
+        Args:
+            attr_name (str): attribute name of op
+            attr_value (Any): attribute value of op
+
+        Raises:
+            ValueError: raised when value is not idential to op's attribute type.
+
+        Returns:
+            [type]: [description]
+        """
+        if attr_type_name != None:
+            print(
+                'WARNING: Argument \'attr_type_name\' of UserOpConfBuilder.Attr has been deprecated. Please remove it.\n\n            For instance:\n                -     .Attr("out_num", out_num, "AttrTypeInt64")\n                +     .Attr("out_num", out_num)\n                        '
+            )
+            print(traceback.format_stack()[-2])
+        attribute = user_op_attr_cfg.AttrValue()
+        assert isinstance(attr_name, str)
+        attr_type = oneflow._oneflow_internal.GetUserOpAttrType(
+            self.user_op_.op_conf_.user_conf.op_type_name, attr_name
+        )
+        if attr_type == user_op_attr_cfg.kAtInt32:
+            assert isinstance(attr_value, int)
+            attribute.set_at_int32(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtInt64:
+            assert isinstance(attr_value, int)
+            attribute.set_at_int64(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtBool:
+            assert isinstance(attr_value, bool)
+            attribute.set_at_bool(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtFloat:
+            assert isinstance(attr_value, (float, int))
+            attribute.set_at_float(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtDouble:
+            assert isinstance(attr_value, (float, int))
+            attribute.set_at_double(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtString:
+            assert isinstance(attr_value, str)
+            attribute.set_at_string(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtShape:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_shape = attribute.mutable_at_shape()
+            for x in attr_value:
+                assert isinstance(x, int)
+                attribute_mutable_at_shape.add_dim(x)
+        elif attr_type == user_op_attr_cfg.kAtDataType:
+            assert attr_value in flow.dtypes()
+            attr_value = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+                attr_value
+            )
+            assert isinstance(attr_value, int)
+            attribute.set_at_data_type(data_type_cfg.DataType(attr_value))
+        elif attr_type == user_op_attr_cfg.kAtListInt32:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_int32 = attribute.mutable_at_list_int32()
+            for x in attr_value:
+                assert isinstance(x, int)
+                attribute_mutable_at_list_int32.add_val(x)
+        elif attr_type == user_op_attr_cfg.kAtListInt64:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_int64 = attribute.mutable_at_list_int64()
+            for x in attr_value:
+                assert isinstance(x, int)
+                attribute_mutable_at_list_int64.add_val(x)
+        elif attr_type == user_op_attr_cfg.kAtListFloat:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_float = attribute.mutable_at_list_float()
+            for x in attr_value:
+                assert isinstance(x, (float, int))
+                attribute_mutable_at_list_float.add_val(x)
+        elif attr_type == user_op_attr_cfg.kAtListDataType:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_data_type = attribute.mutable_at_list_data_type()
+            for x in attr_value:
+                assert x in flow.dtypes()
+                x = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(x)
+                assert isinstance(x, int)
+                attribute_mutable_at_list_data_type.add_val(data_type_cfg.DataType(x))
+        elif attr_type == user_op_attr_cfg.kAtListShape:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_shape = (
+                attribute.mutable_at_list_shape().mutable_val()
+            )
+            for x in attr_value:
+                assert isinstance(x, (tuple, list))
+                shape = shape_cfg.ShapeProto()
+                for dim in x:
+                    assert isinstance(dim, int)
+                    shape.add_dim(dim)
+                attribute_mutable_at_list_shape.Add().CopyFrom(shape)
+        elif attr_type == user_op_attr_cfg.kAtListString:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_string = attribute.mutable_at_list_string()
+            for x in attr_value:
+                assert isinstance(x, str)
+                attribute_mutable_at_list_string.add_val(x)
+        else:
+            raise ValueError("Invalid op attribute type {}".format(attr_type))
+        self.user_op_.op_conf_.user_conf.attr[attr_name].CopyFrom(
+            text_format.Parse(str(attribute), attr_value_pb.AttrValue())
+        )
+        return self
+
+
+def api_user_op_module_builder(op_type_name):
+    api = enable_if.unique(
+        [lazy_user_op_module_builder, eager_logical_user_op_module_builder]
+    )
+    return api(op_type_name)
+
+
+class UserOpModuleBuilder(UserOpConfBuilder):
+    def __init__(self, *args, **kwargs):
+        UserOpConfBuilder.__init__(self, *args, **kwargs)
+        self.user_op_module.op_conf.scope_symbol_id = flow.current_scope().symbol_id
+
+    @property
+    def user_op_module(self):
+        return self.user_op_
+
+    def Op(self, op_type_name):
+        raise ValueError(
+            "user op module builder of {} can't call '.Op(op_type_name)' method".format(
+                op_type_name
+            )
+        )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(LazyUserOpModule, op_name, op_type_name)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_logical_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(EagerLogicalUserOpModule, op_name, op_type_name)
+
+
+class LazyUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        self.set_opkernel_object(None)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        compile_context.CurJobAddOp(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+class EagerLogicalUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        def BuildInstruction(builder):
+            if not isinstance(
+                self.op_conf,
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.OperatorConf,
+            ):
+                cfg_op_conf = oneflow._oneflow_internal.deprecated.MakeOpConfByString(
+                    str(self.op_conf)
+                )
+            self.set_opkernel_object(builder.NewOpKernelObject(cfg_op_conf))
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        interpret_util.OpKernelForward(self.op_conf, self.opkernel_object)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.EagerLogicalBlob(lbi)
+
+
+def api_consistent_user_op_module_builder(op_type_name):
+    api = enable_if.unique(
+        [
+            lazy_consistent_user_op_module_builder,
+            eager_consistent_user_op_module_builder,
+        ]
+    )
+    return api(op_type_name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_consistent_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(LazyConsistentUserOpModule, op_name, op_type_name)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_consistent_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(EagerConsistentUserOpModule, op_name, op_type_name)
+
+
+class LazyConsistentUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        self.set_opkernel_object(None)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        compile_context.CurJobAddConsistentOp(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+class EagerConsistentUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        def BuildInstruction(builder):
+            if not isinstance(
+                self.op_conf,
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.OperatorConf,
+            ):
+                cfg_op_conf = oneflow._oneflow_internal.deprecated.MakeOpConfByString(
+                    str(self.op_conf)
+                )
+            self.set_opkernel_object(builder.NewOpKernelObject(cfg_op_conf))
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        interpret_util.OpKernelConsistentForward(self.op_conf, self.opkernel_object)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.EagerLogicalBlob(lbi)
diff --git a/python/oneflow/compatible/single_client/ops/util/__init__.py b/python/oneflow/compatible/single_client/ops/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/ops/util/custom_op_module.py b/python/oneflow/compatible/single_client/ops/util/custom_op_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..81cc597f0b9aa3933b9f48c2fcf27f12bdad5b43
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/util/custom_op_module.py
@@ -0,0 +1,170 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import importlib.util
+import os
+import os.path
+import shutil
+import subprocess as sp
+import sys
+import sysconfig
+
+import numpy
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import sysconfig as oneflow_sysconfig
+
+
+def run_cmd(cmd, cwd=None):
+    if cwd:
+        res = sp.run(cmd, cwd=cwd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT)
+    else:
+        res = sp.run(cmd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT)
+    out = res.stdout.decode("utf8")
+    if res.returncode != 0:
+        err_msg = "Run cmd failed: {}, output: {}".format(cmd, out)
+        raise Exception(err_msg)
+    if len(out) and out[-1] == "\n":
+        out = out[:-1]
+    return out
+
+
+def compile(compiler, flags, link, inputs, output):
+    if os.path.exists(output):
+        return True
+    if isinstance(inputs, list):
+        cmd = "{} {} {} {} -o {}".format(
+            compiler, " ".join(inputs), flags, link, output
+        )
+    else:
+        cmd = "{} {} {} {} -o {}".format(compiler, inputs, flags, link, output)
+    run_cmd(cmd)
+    return True
+
+
+def get_cflags():
+    return " ".join(oneflow_sysconfig.get_compile_flags())
+
+
+def get_lflags():
+    return (
+        " ".join(oneflow_sysconfig.get_link_flags())
+        + " -Wl,-rpath "
+        + oneflow_sysconfig.get_lib()
+    )
+
+
+class PythonKernelRegistry(object):
+    """A helper class to store python kernel module
+    """
+
+    def __init__(self):
+        self.kernels_ = {}
+
+    def Register(self, op_module_name, module):
+        self.kernels_[op_module_name] = module
+
+
+_python_kernel_reg = PythonKernelRegistry()
+
+
+class CustomOpModule(object):
+    def __init__(self, op_module_name, module_path=""):
+        self.op_module_name_ = op_module_name
+        self.api = None
+        self.so_path_ = ""
+        self.objs_ = []
+        self.has_api_ = False
+        self.has_def_ = False
+        self.has_py_kernel_ = False
+        self.has_cpu_kernel_ = False
+        self.has_gpu_kernel_ = False
+        self.got_so_ = False
+        module_path = os.path.normpath(module_path)
+        pwd_path = os.getcwd()
+        if module_path != "." and module_path != pwd_path:
+            module_folder = os.path.join(module_path, self.op_module_name_)
+            pwd_folder = os.path.join(pwd_path, self.op_module_name_)
+            if os.path.exists(pwd_folder):
+                shutil.rmtree(pwd_folder)
+            shutil.copytree(module_folder, pwd_folder)
+        self.src_prefix_ = os.path.join(
+            pwd_path, self.op_module_name_, self.op_module_name_
+        )
+        out_path = os.path.join(pwd_path, self.op_module_name_, "out")
+        if not os.path.exists(out_path):
+            os.makedirs(out_path)
+        self.out_prefix_ = os.path.join(out_path, self.op_module_name_)
+
+    def py_api(self):
+        assert os.path.exists("{}_py_api.py".format(self.src_prefix_))
+        spec = importlib.util.spec_from_file_location(
+            self.op_module_name_, "{}_py_api.py".format(self.src_prefix_)
+        )
+        self.api = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(self.api)
+        return self
+
+    def cpp_def(self):
+        flags = "-std=c++11 -c -fPIC -O2 " + get_cflags()
+        compile(
+            "g++",
+            flags,
+            get_lflags(),
+            "{}_cpp_def.cpp".format(self.src_prefix_),
+            "{}_cpp_def.o".format(self.out_prefix_),
+        )
+        self.objs_.append("{}_cpp_def.o".format(self.out_prefix_))
+        self.has_def_ = True
+        return self
+
+    def py_kernel(self):
+        assert os.path.exists("{}_py_kernel.py".format(self.src_prefix_))
+        spec = importlib.util.spec_from_file_location(
+            self.op_module_name_, "{}_py_kernel.py".format(self.src_prefix_)
+        )
+        kernel = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(kernel)
+        _python_kernel_reg.Register(self.op_module_name_, kernel)
+        oneflow._oneflow_internal.RegisterPyKernelCaller(self.op_module_name_)
+        self.has_py_kernel_ = True
+        return self
+
+    def cpp_kernel(self):
+        flags = "-std=c++11 -c -fPIC -O2 " + get_cflags()
+        compile(
+            "g++",
+            flags,
+            "",
+            "{}_cpp_kernel.cpp".format(self.src_prefix_),
+            "{}_cpp_kernel.o".format(self.out_prefix_),
+        )
+        self.objs_.append("{}_cpp_kernel.o".format(self.out_prefix_))
+        self.has_cpu_kernel_ = True
+        return self
+
+    def gpu_kernel(self):
+        raise NotImplementedError
+
+    def build_load(self):
+        if len(self.objs_) > 0:
+            flags = "-std=c++11 -shared -fPIC " + get_cflags()
+            compile(
+                "g++", flags, get_lflags(), self.objs_, "{}.so".format(self.out_prefix_)
+            )
+            self.got_so_ = True
+            self.so_path_ = self.out_prefix_ + ".so"
+        flow.config.load_library_now(self.so_path_)
diff --git a/python/oneflow/compatible/single_client/ops/watch.py b/python/oneflow/compatible/single_client/ops/watch.py
new file mode 100644
index 0000000000000000000000000000000000000000..22256841199375f9bdd730f13de7d58da2c8fb68
--- /dev/null
+++ b/python/oneflow/compatible/single_client/ops/watch.py
@@ -0,0 +1,434 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import uuid
+from typing import Callable, Optional, Union
+
+import numpy as np
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal import ConsistentBlob, MirroredBlob
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import eager as eager_util
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import (
+    compile_context as compile_context,
+)
+from oneflow.compatible.single_client.framework import hob as hob
+from oneflow.compatible.single_client.framework import id_util as id_util
+from oneflow.compatible.single_client.framework import local_blob as local_blob_util
+from oneflow.compatible.single_client.framework import remote_blob as remote_blob_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.compatible.single_client.framework import typing as oft
+from oneflow.compatible.single_client.framework import typing_util as oft_util
+from oneflow.compatible.single_client.framework import watcher as watcher_util
+from oneflow.compatible.single_client.support import enable_if as enable_if
+from oneflow.core.job.lbi_diff_watcher_info_pb2 import LbiAndDiffWatcherUuidPair
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def Watch(
+    blob_watched: oneflow._oneflow_internal.BlobDesc,
+    handler_or_prompt: Optional[Union[Callable, str]] = None,
+) -> None:
+    """Register callback for a blob. The callback function will be called after the computation produce the blob finishes. We can use it to watch the values of Blob.
+
+    Args:
+        blob_watched: a `Blob`
+        handler_or_prompt: a function has an argument of a `Blob`
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def watch_Job() -> None:
+            init = flow.constant_initializer(2.5)
+            variable = flow.get_variable(
+                "variable-weight",
+                shape=(5, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(variable, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        watch_Job()
+
+        # out [2.5 2.5 2.5 2.5 2.5]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def watch_Job(x: tp.Numpy.Placeholder((1, 3, 2, 2))
+        ) -> None:
+            initializer = flow.truncated_normal(0.1)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=3,
+                kernel_size=1,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer,
+                name="Conv2d"
+            )
+
+            flow.watch(conv2d, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        x = np.ones(shape=(1, 3, 2, 2)).astype(np.float32)
+        watch_Job(x)
+
+        # out [[[[ 0.03757111  0.03757111]
+        #        [ 0.03757111  0.03757111]]
+
+        #       [[-0.36131713 -0.36131713]
+        #        [-0.36131713 -0.36131713]]
+
+        #       [[-0.12266113 -0.12266113]
+        #        [-0.12266113 -0.12266113]]]]
+
+    """
+    api = enable_if.unique([EagerWatch, LazyWatch])
+    return api(blob_watched, handler_or_prompt)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerWatch(blob_watched, handler_or_prompt=None):
+    handler = _CheckOrMakeHandler(blob_watched, handler_or_prompt)
+    local_blob = local_blob_util.MakeLocalBlob4EagerBlob(blob_watched)
+    handler(oft_util.TransformWatchedBlob(local_blob, handler))
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyWatch(blob_watched, handler_or_prompt=None):
+    handler = _CheckOrMakeHandler(blob_watched, handler_or_prompt)
+    if isinstance(blob_watched, ConsistentBlob):
+        LazyConsistentWatch(blob_watched, handler)
+    elif isinstance(blob_watched, MirroredBlob):
+        handlers = _MakeSubConsistentBlobHandlers(blob_watched, handler)
+        for (consistent_blob, sub_handler) in zip(
+            blob_watched.sub_consistent_blob_list, handlers
+        ):
+            assert isinstance(consistent_blob, ConsistentBlob)
+            LazyConsistentWatch(consistent_blob, sub_handler)
+    else:
+        raise NotImplementedError
+
+
+def LazyConsistentWatch(blob_watched, handler):
+    handler_uuid = str(uuid.uuid1())
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = id_util.UniqueStr("ForeignWatch_")
+    setattr(op_conf.foreign_watch_conf, "in", blob_watched.unique_name)
+    op_conf.foreign_watch_conf.handler_uuid = handler_uuid
+    device_name = blob_watched.parallel_conf.device_name(0)
+    with flow.scope.placement("cpu", "0:0"):
+        compile_context.CurJobAddOp(op_conf)
+    watcher_util.BindUuidAndHandler(handler_uuid, blob_watched, handler)
+
+
+def WatchDiff(
+    blob_watched: oneflow._oneflow_internal.BlobDesc,
+    handler_or_prompt: Optional[Union[Callable, str]] = None,
+) -> None:
+    """Register callback for gradient of a blob. The callback will be called after the computation produce the gradient blob finishes.
+
+    Args:
+        blob_watched: a `Blob`
+        handler_or_prompt: a function has an argument of a `Blob`
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+
+
+        BATCH_SIZE = 20
+
+        def watch_diff_handler(blob: tp.Numpy):
+            print("watch_diff_handler:", blob, blob.shape, blob.dtype)
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            with flow.scope.placement("gpu", "0:0"):
+                reshape = flow.reshape(images, [images.shape[0], -1])
+                hidden = flow.layers.dense(
+                    reshape,
+                    512,
+                    activation=flow.nn.relu,
+                    kernel_initializer=initializer,
+                    name="hidden",
+                )
+                logits = flow.layers.dense(
+                    hidden, 10, kernel_initializer=initializer, name="output"
+                )
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
+
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
+            flow.watch_diff(logits, watch_diff_handler)
+            return loss
+
+
+        if __name__ == "__main__":
+            checkpoint = flow.train.CheckPoint()
+            checkpoint.init()
+            (train_images, train_labels), (test_images, test_labels) = flow.data.load_mnist(
+                    BATCH_SIZE
+            )
+            for i, (images, labels) in enumerate(zip(train_images, train_labels)):
+                loss = train_job(images, labels)
+
+
+        # watch_diff_handler: [[-1.88834548e-01  2.71021971e-03  2.28271242e-02  7.17673637e-03
+        #                       4.10183379e-03  8.93106461e-02  2.23669074e-02  3.86103359e-03
+        #                       3.12465224e-02  5.23346756e-03] .....
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        BATCH_SIZE = 20
+
+        def watch_diff_handler(blob: tp.Numpy):
+            print("watch_diff_handler:", blob)
+
+
+        @flow.global_function(type="train")
+        def watch_matmul_diff_job(
+            images: tp.Numpy.Placeholder((3, 3), dtype=flow.float),
+        ) -> None:
+            with flow.scope.placement("cpu", "0:0"):
+                weight_initializer = flow.constant_initializer(2)
+                weight_shape = (3, BATCH_SIZE)
+                weight = flow.get_variable(
+                    "matmultest-weight",
+                    shape=weight_shape,
+                    initializer=weight_initializer)
+                output = flow.linalg.matmul(images, weight)
+
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(output)
+            flow.watch_diff(weight, watch_diff_handler)
+
+
+        if __name__ == "__main__":
+            check_point = flow.train.CheckPoint()
+            check_point.init()
+
+            x = np.array([[1, 1, 1],
+                        [1, 1, 1],
+                        [1, 1, 1]]).astype(np.float32)
+            watch_matmul_diff_job(x)
+
+        # watch_diff_handler: [[3. 3. 3.]
+        #                      [3. 3. 3.]
+        #                      [3. 3. 3.]]
+
+    Example 3:
+
+    .. code-block:: python
+
+        import oneflow.compatible.single_client as flow
+        import oneflow.compatible.single_client.typing as tp
+        import numpy as np
+
+
+        def watch_diff_handler(blob: tp.Numpy):
+            print("watch_diff_handler:", blob, blob.shape, blob.dtype)
+
+
+        @flow.global_function(type="train")
+        def watch_conv_diff_job(
+            images: tp.Numpy.Placeholder((1, 1, 4, 4), dtype=flow.float),
+        ) -> None:
+            with flow.scope.placement("gpu", "0:0"):
+                weight_shape = (1, 1, 3, 3)
+                weight_initializer = flow.truncated_normal(0.1)
+                weight = flow.get_variable(
+                    name="conv-weight",
+                    shape=weight_shape,
+                    initializer=weight_initializer
+                )
+                output = flow.nn.conv2d(images, weight, strides=1, padding="VALID")
+
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(output)
+            flow.watch_diff(weight, watch_diff_handler)
+
+
+        if __name__ == "__main__":
+            check_point = flow.train.CheckPoint()
+            check_point.init()
+
+            x = np.array([[[[ 1.,  2.,  3.,  4.],
+                            [ 5.,  6.,  7.,  8.],
+                            [ 9., 10., 11., 12.],
+                            [13., 14., 15., 16.]]]]).astype(np.float32)
+
+            watch_conv_diff_job(x)
+
+        # watch_diff_handler: [[[[14. 18. 22.]
+        #                        [30. 34. 38.]
+        #                        [46. 50. 54.]]]]
+
+    """
+    api = enable_if.unique([EagerWatchDiff, LazyWatchDiff])
+    return api(blob_watched, handler_or_prompt)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerWatchDiff(blob_watched, handler_or_prompt=None):
+    handler = _CheckOrMakeHandler(blob_watched, handler_or_prompt)
+    handler_uuid = str(uuid.uuid1())
+    lbi_and_uuid = LbiAndDiffWatcherUuidPair()
+    lbi_and_uuid.lbi.op_name = blob_watched.lbi.op_name()
+    lbi_and_uuid.lbi.blob_name = blob_watched.lbi.blob_name()
+    lbi_and_uuid.watcher_uuid = handler_uuid
+    c_api_util.CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(lbi_and_uuid)
+    uuid2watch_handler = session_ctx.GetDefaultSession().uuid2watch_handler
+    uuid2watch_handler[handler_uuid] = lambda x: EagerWatch(x, handler_or_prompt)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyWatchDiff(blob_watched, handler_or_prompt=None):
+    handler = _CheckOrMakeHandler(blob_watched, handler_or_prompt)
+    if isinstance(blob_watched, ConsistentBlob):
+        LazyConsistentWatchDiff(blob_watched, handler)
+    elif isinstance(blob_watched, MirroredBlob):
+        handlers = _MakeSubConsistentBlobHandlers(blob_watched, handler)
+        for (consistent_blob, sub_handler) in zip(
+            blob_watched.sub_consistent_blob_list, handlers
+        ):
+            assert isinstance(consistent_blob, ConsistentBlob)
+            LazyConsistentWatchDiff(consistent_blob, sub_handler)
+    else:
+        raise NotImplementedError
+
+
+def LazyConsistentWatchDiff(blob_watched, handler):
+    handler_uuid = str(uuid.uuid1())
+    lbi_and_uuid = LbiAndDiffWatcherUuidPair()
+    lbi_and_uuid.lbi.op_name = blob_watched.lbi.op_name()
+    lbi_and_uuid.lbi.blob_name = blob_watched.lbi.blob_name()
+    lbi_and_uuid.watcher_uuid = handler_uuid
+    c_api_util.CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(lbi_and_uuid)
+    watcher_util.BindUuidAndHandler(handler_uuid, blob_watched, handler)
+
+
+def _CheckOrMakeHandler(blob_watched, handler_or_prompt):
+    if callable(handler_or_prompt):
+        parameters = inspect.signature(handler_or_prompt).parameters
+        oft_util.CheckWatchCallbackParameterAnnotation(parameters)
+        annotation = parameters[list(parameters.keys())[0]].annotation
+        oft_util.CheckWatchedBlobByAnnotation(blob_watched, annotation)
+        return handler_or_prompt
+    prompt = handler_or_prompt
+
+    def Handler(x: GetTypeAnnotation(blob_watched)):
+        if prompt is not None:
+            print(str(prompt))
+        print(x)
+
+    return Handler
+
+
+def _MakeSubConsistentBlobHandlers(blob_watched, handler):
+    assert isinstance(blob_watched, MirroredBlob)
+    handler4parallel_id_and_local_blob = _MakeHandler4ParallelIdAndLocalBlob(
+        blob_watched, handler
+    )
+    return [
+        _WrapperHandler4ParallelIdAndLocalBlob(i, handler4parallel_id_and_local_blob)
+        for i in range(len(blob_watched.sub_consistent_blob_list))
+    ]
+
+
+def _WrapperHandler4ParallelIdAndLocalBlob(
+    parallel_id, handler4parallel_id_and_local_blob
+):
+    return lambda local_blob: handler4parallel_id_and_local_blob(
+        parallel_id, local_blob
+    )
+
+
+def _MakeHandler4ParallelIdAndLocalBlob(blob_watched, handler):
+    parallel_id2consistent_local_blob = {}
+    len_sub_remote_blobs = len(blob_watched.sub_consistent_blob_list)
+
+    def HandlerParallelIdAndLocalBlob(parallel_id, local_blob):
+        assert parallel_id not in parallel_id2consistent_local_blob
+        parallel_id2consistent_local_blob[parallel_id] = local_blob
+        if len(parallel_id2consistent_local_blob) != len_sub_remote_blobs:
+            return
+        local_blob_list = [
+            parallel_id2consistent_local_blob[parallel_id]
+            for i in range(len_sub_remote_blobs)
+        ]
+        local_numpy = local_blob_list[0].numpy()
+        if len(local_blob_list) > 1:
+            print("WARNING: watch return tensor list will concat as axis = 0.")
+            local_numpy_list = [x.numpy() for x in local_blob_list]
+            local_numpy = np.concatenate(local_numpy_list, axis=0)
+        local_blob = local_blob_util.LocalBlob(local_numpy, blob_watched.is_dynamic)
+        handler(oft_util.TransformWatchedBlob(local_blob, handler))
+
+    return HandlerParallelIdAndLocalBlob
+
+
+def GetTypeAnnotation(blob_watched):
+    if not blob_watched.is_dynamic:
+        return oft.Numpy
+    else:
+        return oft.ListNumpy
diff --git a/python/oneflow/compatible/single_client/optimizer/__init__.py b/python/oneflow/compatible/single_client/optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7294651d36080afa92c37e804ab21df79d3928b1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/optimizer/__init__.py
@@ -0,0 +1,38 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.optimizer import (
+    LAMB,
+    LARS,
+    SGD,
+    SGDW,
+    Adam,
+    AdamW,
+    CombinedOptimizer,
+    CosineScheduler,
+    CustomScheduler,
+    ExponentialScheduler,
+    InverseTimeScheduler,
+    LazyAdam,
+    LinearCosineScheduler,
+    NaturalExpScheduler,
+    PiecewiseConstantScheduler,
+    PiecewiseScalingScheduler,
+    PolynomialSchduler,
+    PolynomialScheduler,
+    RMSProp,
+)
+
+from . import grad_clipping, loss_scale, warmup
diff --git a/python/oneflow/compatible/single_client/optimizer/grad_clipping.py b/python/oneflow/compatible/single_client/optimizer/grad_clipping.py
new file mode 100644
index 0000000000000000000000000000000000000000..218a7c21c032c0916378143b6e56e8ec27aaa687
--- /dev/null
+++ b/python/oneflow/compatible/single_client/optimizer/grad_clipping.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.optimizer import by_global_norm
diff --git a/python/oneflow/compatible/single_client/optimizer/loss_scale.py b/python/oneflow/compatible/single_client/optimizer/loss_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba3c73d90b57a682cd95f65525ca07e247fc5aa0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/optimizer/loss_scale.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.optimizer import (
+    DynamicLossScalePolicy as dynamic_loss_scale,
+)
+from oneflow.compatible.single_client.ops.optimizer import (
+    StaticLossScalePolicy as static_loss_scale,
+)
diff --git a/python/oneflow/compatible/single_client/optimizer/warmup.py b/python/oneflow/compatible/single_client/optimizer/warmup.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa94ccbda59a5290db9bda601ee284b448303d57
--- /dev/null
+++ b/python/oneflow/compatible/single_client/optimizer/warmup.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.optimizer import constant, linear
diff --git a/python/oneflow/compatible/single_client/profiler.py b/python/oneflow/compatible/single_client/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..685fe48ed79c4fa320f2e8317c539f4288f60025
--- /dev/null
+++ b/python/oneflow/compatible/single_client/profiler.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.profiler import RangePop as range_pop
+from oneflow.compatible.single_client.framework.profiler import RangePush as range_push
+from oneflow.compatible.single_client.ops.array_ops import nvtx_end, nvtx_start
diff --git a/python/oneflow/compatible/single_client/quantization.py b/python/oneflow/compatible/single_client/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..0652ed7aaf920ae57dcdfb361e268af44ac1a94b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/quantization.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.quantize_ops import (
+    fake_quantization,
+    min_max_observer,
+    moving_average_min_max_observer,
+)
diff --git a/python/oneflow/compatible/single_client/random.py b/python/oneflow/compatible/single_client/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..04518ec3b27f5209b5626587f32980586d75e219
--- /dev/null
+++ b/python/oneflow/compatible/single_client/random.py
@@ -0,0 +1,27 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.array_ops import (
+    generate_random_batch_permutation_indices,
+    shuffle,
+)
+from oneflow.compatible.single_client.ops.random_ops import Bernoulli as bernoulli
+from oneflow.compatible.single_client.ops.random_util import (
+    api_gen_random_seed as gen_seed,
+)
+from oneflow.compatible.single_client.ops.user_data_ops import api_coin_flip as CoinFlip
+from oneflow.compatible.single_client.ops.user_data_ops import (
+    api_coin_flip as coin_flip,
+)
diff --git a/python/oneflow/compatible/single_client/regularizers.py b/python/oneflow/compatible/single_client/regularizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d1136aa5d455044a69523e13671af096a64d76c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/regularizers.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.regularizer_util import (
+    l1_l2_regularizer as l1_l2,
+)
+from oneflow.compatible.single_client.ops.regularizer_util import l1_regularizer as l1
+from oneflow.compatible.single_client.ops.regularizer_util import l2_regularizer as l2
diff --git a/python/oneflow/compatible/single_client/saved_model.py b/python/oneflow/compatible/single_client/saved_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..16d43e04f228d79aabe9ddf488178342defe7113
--- /dev/null
+++ b/python/oneflow/compatible/single_client/saved_model.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.serving.saved_model_builder import (
+    GraphBuilder,
+    ModelBuilder,
+    SignatureBuilder,
+)
diff --git a/python/oneflow/compatible/single_client/sbp.py b/python/oneflow/compatible/single_client/sbp.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e9058953ac143070a9f659a5b4fc4a8c558396b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/sbp.py
@@ -0,0 +1,19 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.distribute import split_sbp as split
+
+broadcast = oneflow._oneflow_internal.sbp.broadcast()
+partial_sum = oneflow._oneflow_internal.sbp.partial_sum()
diff --git a/python/oneflow/compatible/single_client/scope.py b/python/oneflow/compatible/single_client/scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..e007ba0530a1640c6c2c2231ce2487f22b53b0ed
--- /dev/null
+++ b/python/oneflow/compatible/single_client/scope.py
@@ -0,0 +1,36 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.experimental.namescope import (
+    name_scope as namespace,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    ConsistentStrategyEnabled as consistent_view_enabled,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    DistributeConsistentStrategy as consistent_view,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    DistributeMirroredStrategy as mirrored_view,
+)
+from oneflow.compatible.single_client.framework.distribute import (
+    MirroredStrategyEnabled as mirrored_view_enabled,
+)
+from oneflow.compatible.single_client.framework.placement_util import (
+    api_placement as placement,
+)
+from oneflow.compatible.single_client.framework.scope_util import (
+    deprecated_current_scope as current_scope,
+)
diff --git a/python/oneflow/compatible/single_client/serving/__init__.py b/python/oneflow/compatible/single_client/serving/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fe893a6b61cd2cd141efda753d6af3b49a196d5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/serving/__init__.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.compatible.single_client.serving.inference_session import (
+    InferenceSession,
+    ModelVersionPolicy,
+    SessionOption,
+)
diff --git a/python/oneflow/compatible/single_client/serving/inference_session.py b/python/oneflow/compatible/single_client/serving/inference_session.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce083cf34a1d19d6e988ed436cd2c72378eebee8
--- /dev/null
+++ b/python/oneflow/compatible/single_client/serving/inference_session.py
@@ -0,0 +1,489 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import asyncio
+import contextlib
+import enum
+import inspect
+import os
+
+import numpy as np
+from google.protobuf import text_format as text_format
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.oneflow.core.common import data_type as dtype_proto_cfg
+from oneflow._oneflow_internal.oneflow.core.common import shape as shape_proto_cfg
+from oneflow._oneflow_internal.oneflow.core.job import job_conf as job_conf_proto_cfg
+from oneflow._oneflow_internal.oneflow.core.job import sbp_parallel as sbp_parallel_cfg
+from oneflow._oneflow_internal.oneflow.core.operator import (
+    interface_blob_conf as interface_blob_conf_proto_cfg,
+)
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import compile_context as compile_ctx
+from oneflow.compatible.single_client.framework import dtype as dtype_util
+from oneflow.compatible.single_client.framework import input_blob_def as input_blob_util
+from oneflow.compatible.single_client.framework import job_instance as job_instance_util
+from oneflow.compatible.single_client.framework import placement_util as placement_util
+from oneflow.compatible.single_client.framework import runtime_mode as runtime_mode
+from oneflow.compatible.single_client.framework import scope_util as scope_util
+from oneflow.compatible.single_client.framework import session_util as session_util
+from oneflow.core.job import job_conf_pb2 as job_conf_proto
+from oneflow.core.operator import interface_blob_conf_pb2 as interface_blob_conf_proto
+from oneflow.core.serving import saved_model_pb2 as saved_model_pb
+
+
+def _is_int(val):
+    try:
+        num = int(val)
+    except ValueError:
+        return False
+    return True
+
+
+def _find_model_latest_version(saved_model_dir):
+    version_dirs = []
+    for f in os.listdir(saved_model_dir):
+        if os.path.isdir(os.path.join(saved_model_dir, f)) and _is_int(f):
+            version_dirs.append(f)
+    version_dirs.sort(reverse=True, key=lambda x: int(x))
+    return version_dirs[0]
+
+
+def _need_check_device_tag(op_conf):
+    if op_conf.HasField("return_conf"):
+        return False
+    return op_conf.HasField("device_tag")
+
+
+def _signature_proto_to_cfg(signature_proto, mut_signature_cfg):
+    assert isinstance(signature_proto, job_conf_proto.JobSignatureDef)
+    assert isinstance(mut_signature_cfg, job_conf_proto_cfg.JobSignatureDef)
+    for (input_name, input_def) in signature_proto.inputs.items():
+        input_def_cfg = job_conf_proto_cfg.JobInputDef()
+        input_def_cfg.mutable_lbi().set_op_name(input_def.lbi.op_name)
+        input_def_cfg.mutable_lbi().set_blob_name(input_def.lbi.blob_name)
+        _inferface_blob_conf_proto_to_cfg(
+            input_def.blob_conf, input_def_cfg.mutable_blob_conf()
+        )
+        mut_signature_cfg.mutable_inputs()[input_name].CopyFrom(input_def_cfg)
+    for (output_name, output_def) in signature_proto.outputs.items():
+        output_def_cfg = job_conf_proto_cfg.JobOutputDef()
+        output_def_cfg.mutable_lbi().set_op_name(output_def.lbi.op_name)
+        output_def_cfg.mutable_lbi().set_blob_name(output_def.lbi.blob_name)
+        mut_signature_cfg.mutable_outputs()[output_name].CopyFrom(output_def_cfg)
+
+
+def _inferface_blob_conf_proto_to_cfg(
+    inferface_blob_conf_proto, mut_inferface_blob_conf_cfg
+):
+    assert isinstance(
+        inferface_blob_conf_proto, interface_blob_conf_proto.InterfaceBlobConf
+    )
+    assert isinstance(
+        mut_inferface_blob_conf_cfg, interface_blob_conf_proto_cfg.InterfaceBlobConf
+    )
+    shape = shape_proto_cfg.ShapeProto()
+    for dim in inferface_blob_conf_proto.shape.dim:
+        shape.add_dim(dim)
+    mut_inferface_blob_conf_cfg.mutable_shape().CopyFrom(shape)
+    dtype = dtype_proto_cfg.DataType(int(inferface_blob_conf_proto.data_type))
+    mut_inferface_blob_conf_cfg.set_data_type(dtype)
+    if inferface_blob_conf_proto.HasField("parallel_distribution"):
+        assert len(inferface_blob_conf_proto.parallel_distribution.sbp_parallel) == 1
+        sbp_proto = inferface_blob_conf_proto.parallel_distribution.sbp_parallel[0]
+        if sbp_proto.HasField("split_parallel"):
+            split_axis = sbp_proto.split_parallel.axis
+            sbp = sbp_parallel_cfg.SbpParallel()
+            sbp.mutable_split_parallel().set_axis(split_axis)
+            mut_inferface_blob_conf_cfg.mutable_parallel_distribution().mutable_sbp_parallel().Add().CopyFrom(
+                sbp
+            )
+    mut_inferface_blob_conf_cfg.set_is_dynamic(inferface_blob_conf_proto.is_dynamic)
+
+
+class ModelVersionPolicy(enum.Enum):
+    LATEST = 1
+
+
+class SessionOption(object):
+    def __init__(self):
+        self.device_tag = "gpu"
+        self.device_num = 1
+        self.is_mirrored_view = False
+
+
+class InferenceSession(object):
+    class SessionStatus(enum.Enum):
+        OPEN = 1
+        RUNNING = 2
+        CLOSED = 3
+
+    def __init__(self, option=None):
+        if option is None:
+            self.option_ = SessionOption()
+        else:
+            assert isinstance(option, SessionOption)
+            self.option_ = option
+        self.is_mirrored_ = self.option_.is_mirrored_view
+        self.checkpoint_path_ = None
+        self.config_proto_ = None
+        self.job_name2job_conf_ = {}
+        self.inter_user_job_info_ = None
+        self.cur_job_name_ = None
+        self.inferface_name2info_ = {}
+        self.output_name2future_ = {}
+        self.job_futures_ = []
+        self.status_ = None
+        self._init_event_loop()
+        self.init()
+
+    def __del__(self):
+        if self.status_ != self.SessionStatus.CLOSED:
+            self.close()
+
+    def _init_event_loop(self):
+        self.event_loop_ = asyncio.get_event_loop()
+        if self.event_loop_.is_closed():
+            asyncio.set_event_loop(asyncio.new_event_loop())
+            self.event_loop_ = asyncio.get_event_loop()
+
+    def init(self):
+        if not oneflow._oneflow_internal.IsEnvInited():
+            flow.env.init()
+        if not oneflow._oneflow_internal.IsSessionInited():
+            self._make_config_proto()
+            session_util._TryCompleteConfigProto(self.config_proto_)
+            c_api_util.InitLazyGlobalSession(self.config_proto_)
+        self.status_ = self.SessionStatus.OPEN
+
+    def close(self):
+        self.event_loop_.run_until_complete(self.wait_for_all_jobs_finished())
+        self.event_loop_.close()
+        if self.status_ == self.SessionStatus.RUNNING:
+            oneflow._oneflow_internal.StopLazyGlobalSession()
+            oneflow._oneflow_internal.DestroyLazyGlobalSession()
+        elif self.status_ == self.SessionStatus.OPEN:
+            oneflow._oneflow_internal.DestroyLazyGlobalSession()
+        else:
+            pass
+        self.status_ = self.SessionStatus.CLOSED
+
+    def _check_status(self, *status):
+        check_success = False
+        for stat in status:
+            if self.status_ == stat:
+                check_success = True
+                break
+        if check_success is False:
+            caller_func_name = inspect.stack()[1].function
+            allowed_status = ",".join(status)
+            raise ValueError(
+                "The calling to {} is only allowed when status is {}, current status is {}".format(
+                    caller_func_name, allowed_status, self.status_
+                )
+            )
+
+    def _make_config_proto(self):
+        if self.config_proto_ is None:
+            self.config_proto_ = session_util._GetDefaultConfigProto()
+        if self.option_.device_tag == "gpu":
+            self.config_proto_.resource.gpu_device_num = self.option_.device_num
+        elif self.option_.device_tag == "cpu":
+            self.config_proto_.resource.cpu_device_num = self.option_.device_num
+            self.config_proto_.resource.gpu_device_num = 0
+        else:
+            raise NotImplementedError(
+                "not supported device tag {}".format(self.option_.device_tag)
+            )
+        self.config_proto_.resource.enable_legacy_model_io = True
+
+    def set_checkpoint_path(self, checkpoint_path):
+        self._check_status(self.SessionStatus.OPEN)
+        self.checkpoint_path_ = checkpoint_path
+
+    def set_job_signature(self, job_name, signature):
+        assert isinstance(signature, job_conf_proto.JobSignatureDef)
+        job_conf = self._get_job_conf(job_name)
+        _signature_proto_to_cfg(signature, job_conf.mutable_signature())
+
+    def set_job_batch_size(self, job_name, batch_size):
+        self._check_status(self.SessionStatus.OPEN)
+        job_conf = self._get_job_conf(job_name)
+        for (_, mut_input_def) in job_conf.mutable_signature().mutable_inputs().items():
+            mut_shape = mut_input_def.mutable_blob_conf().mutable_shape()
+            mut_shape.mutable_dim()[0] = batch_size
+
+    def _get_job_conf(self, job_name):
+        if job_name in self.job_name2job_conf_:
+            return self.job_name2job_conf_[job_name]
+        else:
+            job_conf = job_conf_proto_cfg.JobConfigProto()
+            job_conf.set_job_name(job_name)
+            job_conf.mutable_predict_conf()
+            self.job_name2job_conf_[job_name] = job_conf
+            return job_conf
+
+    @contextlib.contextmanager
+    def open(self, job_name, signature=None, batch_size=None):
+        self._check_status(self.SessionStatus.OPEN)
+        c_api_util.JobBuildAndInferCtx_Open(job_name)
+        if signature is not None:
+            self.set_job_signature(job_name, signature)
+        if isinstance(batch_size, int):
+            self.set_job_batch_size(job_name, batch_size)
+        job_conf = self._get_job_conf(job_name)
+        c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+        tag_and_dev_ids = placement_util.GetDefaultMachineDeviceIds(
+            self.config_proto_.resource
+        )
+        scope = scope_util.MakeInitialScope(
+            job_conf, *tag_and_dev_ids, None, self.is_mirrored_
+        )
+        with runtime_mode.ModeScope(runtime_mode.GLOBAL_MODE):
+            with scope_util.ScopeContext(scope):
+                self.cur_job_name_ = job_name
+                yield self
+                self.cur_job_name_ = None
+        oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+    def compile(self, op_list):
+        self._check_status(self.SessionStatus.OPEN)
+        scope = flow.current_scope()
+        device_tag = scope.device_parallel_desc_symbol.device_tag
+        for op_conf in op_list:
+            if _need_check_device_tag(op_conf) and op_conf.device_tag != device_tag:
+                print(
+                    "WARNING: the device_tag of op {} is not equal to the device_tag of seesion's current scope ({} vs. {}), which may cause the op graph to be incompatible".format(
+                        op_conf.name, op_conf.device_tag, device_tag
+                    )
+                )
+            compile_ctx.CurJobAddOp(op_conf)
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Rebuild()
+
+    def launch(self):
+        self._check_status(self.SessionStatus.OPEN)
+        oneflow._oneflow_internal.StartLazyGlobalSession()
+        self.inter_user_job_info_ = c_api_util.GetInterUserJobInfo()
+        self._run_load_checkpoint_job()
+        self.status_ = self.SessionStatus.RUNNING
+
+    def load_saved_model(
+        self,
+        saved_model_dir,
+        model_version=ModelVersionPolicy.LATEST,
+        saved_model_meta_file_basename="saved_model",
+        graph_name=None,
+        signature_name=None,
+    ):
+        if not os.path.isdir(saved_model_dir):
+            raise ValueError("{} is not a valid directory".format(saved_model_dir))
+        if isinstance(model_version, int):
+            pass
+        elif model_version == ModelVersionPolicy.LATEST:
+            model_version = _find_model_latest_version(saved_model_dir)
+        else:
+            raise NotImplementedError
+        saved_model_path = os.path.join(saved_model_dir, str(model_version))
+        if not os.path.isdir(saved_model_path):
+            raise ValueError(
+                "version {} of saved model in dir {} do not exist".format(
+                    model_version, saved_model_dir
+                )
+            )
+        subfiles = list(os.listdir(saved_model_path))
+        saved_model_meta_pb_filename = saved_model_meta_file_basename + ".pb"
+        saved_model_meta_prototxt_filename = (
+            saved_model_meta_file_basename + ".prototxt"
+        )
+        saved_model_proto = saved_model_pb.SavedModel()
+        if saved_model_meta_pb_filename in subfiles:
+            saved_model_meta_file_path = os.path.join(
+                saved_model_path, saved_model_meta_pb_filename
+            )
+            with open(saved_model_meta_file_path, "rb") as f:
+                saved_model_proto.ParseFromString(f.read())
+        elif saved_model_meta_prototxt_filename in subfiles:
+            saved_model_meta_file_path = os.path.join(
+                saved_model_path, saved_model_meta_prototxt_filename
+            )
+            with open(saved_model_meta_file_path, "rt") as f:
+                text_format.Merge(f.read(), saved_model_proto)
+        else:
+            raise ValueError(
+                "saved model meta file {} do not exist in {}".format(
+                    saved_model_meta_file_basename, saved_model_path
+                )
+            )
+        self.set_checkpoint_path(
+            os.path.join(saved_model_path, saved_model_proto.checkpoint_dir)
+        )
+        signature = None
+        if graph_name is None:
+            graph_name = saved_model_proto.default_graph_name
+        elif graph_name not in saved_model_proto.graphs:
+            raise ValueError("graph {} do not exist".format(graph_name))
+        graph_def = saved_model_proto.graphs[graph_name]
+        if signature_name is None and graph_def.HasField("default_signature_name"):
+            signature_name = graph_def.default_signature_name
+        if signature_name is not None:
+            if signature_name not in graph_def.signatures:
+                raise ValueError("signature {} do not exist".format(signature_name))
+            else:
+                signature = graph_def.signatures[signature_name]
+        with self.open(graph_name, signature):
+            self.compile(graph_def.op_list)
+
+    def print_job_set(self):
+        self._check_status(self.SessionStatus.OPEN, self.SessionStatus.RUNNING)
+        job_set = c_api_util.GetJobSet()
+        for job in job_set.job:
+            print("job_name:", job.job_conf.job_name)
+            for op_conf in job.net.op:
+                print("\top_name:", op_conf.name)
+
+    def list_jobs(self):
+        self._check_status(self.SessionStatus.RUNNING)
+        return list(self.job_name2job_conf_.keys())
+
+    def list_inputs(self):
+        self._check_status(self.SessionStatus.RUNNING)
+        input_names = []
+        for (
+            input_name,
+            _,
+        ) in self.inter_user_job_info_.input_or_var_op_name2push_job_name.items():
+            input_names.append(input_name)
+        return tuple(input_names)
+
+    def list_outputs(self):
+        self._check_status(self.SessionStatus.RUNNING)
+        output_names = []
+        for (
+            output_name,
+            _,
+        ) in self.inter_user_job_info_.output_or_var_op_name2pull_job_name.items():
+            output_names.append(output_name)
+        return tuple(output_names)
+
+    def input_info(self, input_name, job_name=None):
+        return self._get_op_blob_info(job_name, input_name, "out")
+
+    def output_info(self, output_name, job_name=None):
+        return self._get_op_blob_info(job_name, output_name, "in")
+
+    def _get_op_blob_info(self, job_name, op_name, blob_name):
+        self._check_status(self.SessionStatus.OPEN, self.SessionStatus.RUNNING)
+        if op_name in self.inferface_name2info_:
+            return self.inferface_name2info_[op_name]
+        job_name = job_name or self.cur_job_name_
+        if job_name is None:
+            raise ValueError("please specify job_name")
+        lbn = oneflow._oneflow_internal.JobBuildAndInferCtx_GetOpBlobLbn(
+            job_name, op_name, blob_name
+        )
+        shape = c_api_util.JobBuildAndInferCtx_GetStaticShape(job_name, lbn)
+        dtype = c_api_util.JobBuildAndInferCtx_GetDataType(job_name, lbn)
+        dtype = dtype_util.convert_proto_dtype_to_oneflow_dtype(dtype)
+        info = dict(shape=shape, dtype=dtype)
+        self.inferface_name2info_[op_name] = info
+        return info
+
+    def run(self, job_name, **kwargs):
+        self._check_status(self.SessionStatus.RUNNING)
+        return self.event_loop_.run_until_complete(self.async_run(job_name, **kwargs))
+
+    async def async_run(self, job_name, **kwargs):
+        self._check_status(self.SessionStatus.RUNNING)
+        self._run_push_jobs(**kwargs)
+        job_inst = job_instance_util.MakeUserJobInstance(job_name)
+        self._run_job(job_inst)
+        output_futures = tuple(self._run_pull_jobs(job_name).values())
+        return await asyncio.gather(*output_futures)
+
+    def _run_job(self, job_inst):
+        future = self.event_loop_.create_future()
+
+        def job_finish_cb(_):
+            self.event_loop_.call_soon_threadsafe(future.set_result, None)
+
+        job_inst.AddPostFinishCallback(job_finish_cb)
+        oneflow._oneflow_internal.LaunchJob(job_inst)
+        self.job_futures_.append(future)
+
+    def _run_push_jobs(self, **kwargs):
+        for (
+            input_name,
+            push_job_name,
+        ) in self.inter_user_job_info_.input_or_var_op_name2push_job_name.items():
+            if input_name not in kwargs:
+                raise ValueError('input "{}" is absent'.format(input_name))
+            input_numpy = kwargs[input_name]
+            if not isinstance(input_numpy, np.ndarray):
+                raise ValueError('input "{}" requires numpy.ndarray'.format(input_name))
+            push_fn = input_blob_util._MakePushNdarrayCallback(input_numpy)
+            push_job_inst = job_instance_util.MakePushJobInstance(
+                push_job_name, input_name, push_fn
+            )
+            self._run_job(push_job_inst)
+
+    def _run_pull_jobs(self, user_job_name):
+        output_futures = {}
+        for (
+            output_name,
+            pull_job_name,
+        ) in self.inter_user_job_info_.output_or_var_op_name2pull_job_name.items():
+            future = self.event_loop_.create_future()
+            pull_fn = self._make_pull_job_cb(output_name, user_job_name, future)
+            pull_job_inst = job_instance_util.MakePullJobInstance(
+                pull_job_name, output_name, pull_fn
+            )
+            self._run_job(pull_job_inst)
+            output_futures[output_name] = future
+        return output_futures
+
+    def _make_pull_job_cb(self, output_name, user_job_name, future):
+        output_lbn = oneflow._oneflow_internal.JobBuildAndInferCtx_GetOpBlobLbn(
+            user_job_name, output_name, "out"
+        )
+        split_axis = c_api_util.JobBuildAndInferCtx_GetSplitAxisFromProducerView(
+            user_job_name, output_lbn
+        )
+
+        def pull_fn(ofblob):
+            ndarray = ofblob.CopyToNdarray()
+            self.event_loop_.call_soon_threadsafe(future.set_result, ndarray)
+
+        return pull_fn
+
+    def _run_load_checkpoint_job(self):
+        if self.checkpoint_path_ is None:
+            raise ValueError("checkpoint path not set")
+
+        def copy_model_load_path(ofblob):
+            ofblob.CopyFromNdarray(
+                np.frombuffer(self.checkpoint_path_.encode("ascii"), dtype=np.int8)
+            )
+
+        load_checkpoint_job_inst = job_instance_util.MakeJobInstance(
+            self.inter_user_job_info_.global_model_load_job_name,
+            push_cb=copy_model_load_path,
+        )
+        self._run_job(load_checkpoint_job_inst)
+
+    async def wait_for_all_jobs_finished(self):
+        await asyncio.gather(*self.job_futures_)
+        self.job_futures_ = []
diff --git a/python/oneflow/compatible/single_client/serving/saved_model_builder.py b/python/oneflow/compatible/single_client/serving/saved_model_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c587f6d1fb8ec189852e81a2396707be25a76d9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/serving/saved_model_builder.py
@@ -0,0 +1,312 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import typing
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client.framework import c_api_util as c_api_util
+from oneflow.compatible.single_client.framework import session_context as session_ctx
+from oneflow.core.job import job_conf_pb2 as job_conf_pb
+from oneflow.core.job import sbp_parallel_pb2 as sbp_parallel_pb
+from oneflow.core.operator import interface_blob_conf_pb2 as interface_blob_conf_pb
+from oneflow.core.register import logical_blob_id_pb2 as logical_blob_id_pb
+from oneflow.core.serving import saved_model_pb2 as saved_model_pb
+
+
+class ModelBuilder(object):
+    DEFAULT_CHECKPOINT_DIR = "variables"
+    DEFAULT_SAVED_MODEL_FILE_BASENAME = "saved_model"
+
+    def __init__(self, save_path: str):
+        if not isinstance(save_path, str):
+            raise ValueError(
+                "param 'save_path' must be str, but got {}".format(save_path)
+            )
+        self.version_ = None
+        self.checkpoint_dir_ = self.DEFAULT_CHECKPOINT_DIR
+        self.saved_model_dir_ = save_path
+        self.saved_model_pb_filename_ = "{}.pb".format(
+            self.DEFAULT_SAVED_MODEL_FILE_BASENAME
+        )
+        self.saved_model_pbtxt_filename_ = "{}.prototxt".format(
+            self.DEFAULT_SAVED_MODEL_FILE_BASENAME
+        )
+        self.saved_model_proto_ = saved_model_pb.SavedModel()
+        self.graph_builders_ = {}
+
+    @property
+    def proto(self):
+        return self.saved_model_proto_
+
+    def ModelName(self, model_name: str):
+        assert isinstance(model_name, str)
+        self.proto.name = model_name
+        return self
+
+    def Version(self, version: int):
+        assert isinstance(version, int)
+        self.version_ = version
+        return self
+
+    def AddFunction(self, func):
+        func_name = func.__name__
+        if func_name in self.graph_builders_:
+            raise ValueError("function with name {} already exists".format(func_name))
+        graph_builder = GraphBuilder(func_name, self)
+        self.graph_builders_[func_name] = graph_builder
+        if not self.proto.HasField("default_graph_name"):
+            self.proto.default_graph_name = func_name
+        return graph_builder
+
+    def _check_input_output_name_conflict(self):
+        name_set = set()
+        lbn_set = set()
+
+        def check_name_conflict(name, interface_def):
+            if name in name_set:
+                raise ValueError("input conflict, {} already exist".format(name))
+            name_set.add(name)
+            lbn = Lbi2Lbn(interface_def.lbi)
+            if lbn in lbn_set:
+                raise ValueError(
+                    "input conflict, {} already bind to other input".format(lbn)
+                )
+            lbn_set.add(lbn)
+
+        for (_, graph_def) in self.proto.graphs.items():
+            for (_, signature_def) in graph_def.signatures.items():
+                for (input_name, input_def) in signature_def.inputs.items():
+                    check_name_conflict(input_name, input_def)
+                for (output_name, output_def) in signature_def.outputs.items():
+                    check_name_conflict(output_name, output_def)
+
+    @session_ctx.try_init_default_session
+    def Save(self, save_model_before_graph_complete: bool = True):
+        self._check_input_output_name_conflict()
+        for (_, graph_builder) in self.graph_builders_.items():
+            if not graph_builder.finished:
+                graph_builder.Finish()
+        sess = session_ctx.GetDefaultSession()
+        for (graph_name, graph_def) in self.proto.graphs.items():
+            job = sess.Job(
+                graph_name
+                if save_model_before_graph_complete
+                else graph_name + "_after_complete"
+            )
+            graph_def.op_list.extend(list(job.net.op))
+        if not os.path.exists(self.saved_model_dir_):
+            os.makedirs(self.saved_model_dir_)
+        if self.version_ is None:
+            raise ValueError("model version is not set")
+        version_dir = os.path.join(self.saved_model_dir_, str(self.version_))
+        if os.path.exists(version_dir):
+            raise ValueError(
+                'Directory of model "{}" version "{}" already exist.'.format(
+                    self.saved_model_dir_, self.version_
+                )
+            )
+        os.makedirs(version_dir)
+        self.proto.version = self.version_
+        checkpoint_path = os.path.join(version_dir, self.checkpoint_dir_)
+        flow.checkpoint.save(checkpoint_path)
+        self.proto.checkpoint_dir = self.checkpoint_dir_
+        saved_model_pb_path = os.path.join(version_dir, self.saved_model_pb_filename_)
+        with open(saved_model_pb_path, "wb") as writer:
+            writer.write(self.saved_model_proto_.SerializeToString())
+        saved_model_pbtxt_path = os.path.join(
+            version_dir, self.saved_model_pbtxt_filename_
+        )
+        with open(saved_model_pbtxt_path, "wt") as writer:
+            writer.write(text_format.MessageToString(self.saved_model_proto_))
+
+
+class GraphBuilder(object):
+    def __init__(self, name: str, model_builder: typing.Optional[ModelBuilder] = None):
+        if not isinstance(name, str):
+            raise ValueError("param 'name' must be str, but got {}".format(name))
+        if not isinstance(model_builder, ModelBuilder) and model_builder is not None:
+            raise ValueError(
+                "param 'model_builder' must be a type of ModelBuilder or None"
+            )
+        if model_builder is not None:
+            if name in model_builder.proto.graphs:
+                raise ValueError(
+                    "graph function ({}) is already added to model ({})".format(
+                        name, model_builder.proto.name
+                    )
+                )
+            self.proto_ = model_builder.proto.graphs[name]
+            self.owner_ = model_builder
+        else:
+            self.proto_ = saved_model_pb.GraphDef()
+            self.owner_ = None
+        self.name_ = name
+        self.finished_ = False
+        self.signature_builders_ = {}
+
+    @property
+    def name(self):
+        return self.name_
+
+    @property
+    def proto(self):
+        return self.proto_
+
+    @property
+    def finished(self):
+        return self.finished_
+
+    def AddSignature(self, signature_name: str):
+        assert isinstance(signature_name, str)
+        if signature_name in self.signature_builders_:
+            raise ValueError("signature name {} already exists".format(signature_name))
+        signature_builder = SignatureBuilder(signature_name, self)
+        self.signature_builders_[signature_name] = signature_builder
+        if not self.proto.HasField("default_signature_name"):
+            self.proto.default_signature_name = signature_name
+        return signature_builder
+
+    def Finish(self):
+        assert self.finished is False
+        for (_, signature_def) in self.proto.signatures.items():
+            for (_, input_def) in signature_def.inputs.items():
+                input_lbn = Lbi2Lbn(input_def.lbi)
+                oneflow._oneflow_internal.JobBuildAndInferCtx_CheckLbnValidAndExist(
+                    self.name, input_lbn
+                )
+                GetInterfaceBlobConf(self.name, input_lbn, input_def.blob_conf)
+            for (_, output_def) in signature_def.outputs.items():
+                oneflow._oneflow_internal.JobBuildAndInferCtx_CheckLbnValidAndExist(
+                    self.name, Lbi2Lbn(output_def.lbi)
+                )
+        self.finished_ = True
+
+    def OwnerModelBuilder(self):
+        return self.owner_
+
+    def AsDefault(self):
+        if self.owner_ is not None:
+            self.owner_.proto.default_graph_name = self.name
+        return self
+
+
+class SignatureBuilder(object):
+    def __init__(self, name: str, graph_builder: typing.Optional[GraphBuilder] = None):
+        if not isinstance(name, str):
+            raise ValueError("param 'name' must be str, but got {}".format(name))
+        if not isinstance(graph_builder, GraphBuilder) and graph_builder is not None:
+            raise ValueError(
+                "param 'graph_builder' must be a type of GraphBuilder or None"
+            )
+        if graph_builder is not None:
+            if name in graph_builder.proto.signatures:
+                raise ValueError(
+                    "signature ({}) already exist in graph ({})".format(
+                        name, graph_builder.name
+                    )
+                )
+            self.proto_ = graph_builder.proto.signatures[name]
+            self.owner_ = graph_builder
+        else:
+            self.proto_ = job_conf_pb.JobSignatureDef()
+            self.owner_ = None
+        self.name_ = name
+
+    @property
+    def name(self):
+        return self.name_
+
+    @property
+    def proto(self):
+        return self.proto_
+
+    def Input(self, input_name: str, lbn: str):
+        assert isinstance(input_name, str)
+        assert isinstance(lbn, str)
+        assert "/" in lbn
+        if input_name in self.proto.inputs:
+            raise ValueError(
+                "input_name ({}) already exist in signature ({}) of graph ({})".format(
+                    input_name, self.name, self.graph_builder_.name
+                )
+            )
+        input_def = self.proto.inputs[input_name]
+        Lbn2Lbi(lbn, input_def.lbi)
+        return self
+
+    def Output(self, output_name: str, lbn: str):
+        assert isinstance(output_name, str)
+        assert isinstance(lbn, str)
+        assert "/" in lbn
+        if output_name in self.proto.outputs:
+            raise ValueError(
+                "output_name ({}) already exist in signature ({}) of graph ({})".format(
+                    output_name, self.name, self.graph_builder_.name
+                )
+            )
+        output_def = self.proto.outputs[output_name]
+        Lbn2Lbi(lbn, output_def.lbi)
+        return self
+
+    def OwnerGraphBuilder(self):
+        return self.owner_
+
+    def AsDefault(self):
+        if self.owner_ is not None:
+            self.owner_.proto.default_signature_name = self.name
+        return self
+
+
+def GetInterfaceBlobConf(job_name, lbn, blob_conf=None):
+    assert isinstance(job_name, str)
+    assert isinstance(lbn, str)
+    if blob_conf is None:
+        blob_conf = interface_blob_conf_pb.InterfaceBlobConf()
+    else:
+        assert isinstance(blob_conf, interface_blob_conf_pb.InterfaceBlobConf)
+    shape = c_api_util.JobBuildAndInferCtx_GetStaticShape(job_name, lbn)
+    dtype = c_api_util.JobBuildAndInferCtx_GetDataType(job_name, lbn)
+    split_axis = c_api_util.JobBuildAndInferCtx_GetSplitAxisFromProducerView(
+        job_name, lbn
+    )
+    is_dynamic = c_api_util.JobBuildAndInferCtx_IsDynamic(job_name, lbn)
+    blob_conf.shape.dim.extend(shape)
+    blob_conf.data_type = dtype
+    if split_axis is not None:
+        sbp_parallel = sbp_parallel_pb.SbpParallel()
+        sbp_parallel.split_parallel.axis = split_axis
+        blob_conf.parallel_distribution.sbp_parallel.extend([sbp_parallel])
+    blob_conf.is_dynamic = is_dynamic
+    return blob_conf
+
+
+def Lbn2Lbi(lbn, lbi=None):
+    assert isinstance(lbn, str)
+    assert "/" in lbn, 'invalid lbn "{}"'.format(lbn)
+    [op_name, blob_name] = lbn.split("/")
+    if lbi is None:
+        lbi = logical_blob_id_pb.LogicalBlobId()
+    lbi.op_name = op_name
+    lbi.blob_name = blob_name
+    return lbi
+
+
+def Lbi2Lbn(lbi):
+    assert isinstance(lbi, logical_blob_id_pb.LogicalBlobId)
+    return "{}/{}".format(lbi.op_name, lbi.blob_name)
diff --git a/python/oneflow/compatible/single_client/summary/__init__.py b/python/oneflow/compatible/single_client/summary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e88ce6e0ab8985f2462878d62fb826bc3ec3cdc
--- /dev/null
+++ b/python/oneflow/compatible/single_client/summary/__init__.py
@@ -0,0 +1,37 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.compatible.single_client.ops.summary_ops import (
+    create_summary_writer,
+    flush_summary_writer,
+)
+from oneflow.compatible.single_client.ops.summary_ops import (
+    write_histogram as histogram,
+)
+from oneflow.compatible.single_client.ops.summary_ops import write_image as image
+from oneflow.compatible.single_client.ops.summary_ops import write_pb as pb
+from oneflow.compatible.single_client.ops.summary_ops import write_scalar as scalar
+from oneflow.compatible.single_client.summary.summary_graph import Graph
+from oneflow.compatible.single_client.summary.summary_hparams import (
+    HParam,
+    IntegerRange,
+    Metric,
+    RealRange,
+    ValueSet,
+    hparams,
+    text,
+)
+from oneflow.compatible.single_client.summary.summary_projector import Projector
diff --git a/python/oneflow/compatible/single_client/summary/summary_graph.py b/python/oneflow/compatible/single_client/summary/summary_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..7279ee8405219b3463c25da3a0d0556571752d6c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/summary/summary_graph.py
@@ -0,0 +1,66 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import logging
+import os
+import time
+
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+from oneflow.core.summary import projector_pb2 as projector_pb2
+
+
+class Graph(object):
+    """The class of Graph
+
+    This class can write 'computing_graph' or 'structure_graph' into log file
+    """
+
+    def __init__(self, logdir=None):
+        """Create a Graph object
+
+        Args:
+            logdir: The log dir
+
+        Raises:
+            Exception: If log dir is None or illegal
+        """
+        if logdir is None:
+            raise Exception("logdir should not be None!")
+        logdir += "/graph"
+        if not os.path.exists(logdir):
+            os.makedirs(logdir)
+        self.logdir_ = logdir
+        self.structure_graph_filename_ = None
+        self.compute_graph_filename_ = None
+
+    def write_structure_graph(self):
+        if self.structure_graph_filename_ is not None and os.path.exists(
+            self.structure_graph_filename_
+        ):
+            raise OSError("You must create only one structure graph log file!")
+        self.structure_graph_filename_ = self.logdir_ + "/structure_graph.json"
+        struct_graph_str = oneflow._oneflow_internal.GetSerializedStructureGraph()
+        with open(self.structure_graph_filename_, "w", encoding="utf-8") as f:
+            f.write(str(struct_graph_str))
+            f.flush()
+
+    @property
+    def logdir(self):
+        return self.logdir_
+
+    @property
+    def structure_graph_filename(self):
+        return self.structure_graph_filename_
diff --git a/python/oneflow/compatible/single_client/summary/summary_hparams.py b/python/oneflow/compatible/single_client/summary/summary_hparams.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0a273c7a971e07e605ee77f5c866812356ed008
--- /dev/null
+++ b/python/oneflow/compatible/single_client/summary/summary_hparams.py
@@ -0,0 +1,339 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import hashlib
+import json
+import time
+
+import numpy as np
+import six
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.summary import event_pb2 as event_pb2
+from oneflow.core.summary import plugin_data_pb2 as plugin_data_pb2
+from oneflow.core.summary import projector_pb2 as projector_pb2
+from oneflow.core.summary import summary_pb2 as summary_pb2
+from oneflow.core.summary import tensor_pb2 as tensor_pb2
+
+
+def text(text, tag=None):
+    """Add a text list to Summary
+
+    Args:
+        text: A str list
+        tag: The tag of summary
+
+    Returns:
+        A protobuf message [Summary]
+    """
+    if isinstance(text, (tuple, list)) and len(text) > 0:
+        if not isinstance(tag, str) or tag is None:
+            tag = "text"
+        text_size = len(text)
+        tensor_shape = tensor_pb2.TensorShapeProto()
+        dim = tensor_shape.dim.add()
+        dim.size = text_size
+        tensor = tensor_pb2.TensorProto(
+            dtype=tensor_pb2.DT_STRING, tensor_shape=tensor_shape
+        )
+        for idx in range(text_size):
+            tensor.string_val.append(text[idx].encode("utf-8"))
+        summary = summary_pb2.Summary()
+        value = summary.value.add(
+            tag=tag,
+            metadata=summary_pb2.SummaryMetadata(
+                plugin_data=summary_pb2.SummaryMetadata.PluginData(plugin_name="text")
+            ),
+            tensor=tensor,
+        )
+        return summary
+
+
+def _get_tensor(values, dtype=None, shape=None):
+    array = np.empty(shape, dtype=np.float)
+    tensor_shape = tensor_pb2.TensorShapeProto()
+    dim = tensor_shape.dim.add()
+    dim.size = 0
+    tensor_proto = tensor_pb2.TensorProto(
+        dtype=tensor_pb2.DT_FLOAT, tensor_shape=tensor_shape
+    )
+    proto_values = array.ravel()
+    tensor_proto.float_val.extend([np.asscalar(x) for x in proto_values])
+    return tensor_proto
+
+
+def hparams(hparams):
+    """Add hparams to Summary
+
+    Args:
+        hparams: A dict of Hparams
+
+    Raises:
+        TypeError: If the type of hparam not in (str, int, float, bool)
+        TypeError: If the type of metric not in (float, int)
+
+    Returns:
+        A protobuf message [Summary]
+    """
+    (hparams, metrics) = _get_hparams_dict(hparams)
+    jparams = json.dumps(hparams, sort_keys=True, separators=(",", ":"))
+    group_name = hashlib.sha256(jparams.encode("utf-8")).hexdigest()
+    session_start_info = plugin_data_pb2.SessionStartInfo(
+        group_name=group_name, start_time_secs=time.time()
+    )
+    for key in sorted(hparams):
+        value = hparams[key]
+        if isinstance(value, str):
+            session_start_info.hparams[key].string_value = value
+        elif isinstance(value, (float, int)):
+            session_start_info.hparams[key].number_value = value
+        elif isinstance(value, bool):
+            session_start_info.hparams[key].bool_value = value
+        else:
+            raise TypeError("the type of value: %r is not supported!" % value)
+    for key in metrics:
+        value = metrics[key]
+        if isinstance(value, (float, int)):
+            session_start_info.metrics[key].number_value = value
+        else:
+            raise TypeError("the type of value: %r is not supported!" % value)
+    summary = summary_pb2.Summary()
+    summary_metadata = _get_metadata(
+        plugin_data_pb2.HParamsPluginData(session_start_info=session_start_info)
+    )
+    summary.value.add(
+        tag="_hparams_/session_start_info",
+        metadata=summary_metadata,
+        tensor=_get_tensor([], tensor_pb2.DT_FLOAT, (0,)),
+    )
+    return summary
+
+
+def _get_metadata(hparams_plugin_data):
+    plugin_data = plugin_data_pb2.HParamsPluginData()
+    plugin_data.CopyFrom(hparams_plugin_data)
+    plugin_data.version = 0
+    return summary_pb2.SummaryMetadata(
+        plugin_data=summary_pb2.SummaryMetadata.PluginData(
+            plugin_name="hparams", content=plugin_data.SerializeToString()
+        )
+    )
+
+
+def _get_hparams_dict(hparams):
+    hparams_dict = {}
+    metrics_dict = {}
+    for (key, value) in dict.items(hparams):
+        if key in hparams_dict or key in metrics_dict:
+            raise ValueError("the key is already exist %r" % (key,))
+        if isinstance(key, HParam):
+            key = key.name
+        if isinstance(key, Metric):
+            metrics_dict[key.name] = _get_value(value)
+            continue
+        hparams_dict[key] = _get_value(value)
+    return (hparams_dict, metrics_dict)
+
+
+def _get_value(value):
+    if isinstance(value, np.generic):
+        return value.item()
+    else:
+        return value
+
+
+class HParam(object):
+    """The class of Hparam
+
+    This class describes the name and the type of Hparam
+    """
+
+    def __init__(self, name, dtype=None):
+        """Create a Hparam object
+
+        Args:
+            name: Hparam name
+            dtype: Hparam type
+
+        Raises:
+            ValueError: If Hparam type not in (IntegerRange, RealRange, ValueSet)
+        """
+        self.name_ = name
+        self.dtype_ = dtype
+        if not isinstance(self.dtype_, (IntegerRange, RealRange, ValueSet, type(None))):
+            raise ValueError(
+                "Hparam dtype must be: (IntegerRange, RealRange, ValueSet) : %r"
+                % (self.dtype_,)
+            )
+
+    @property
+    def name(self):
+        return self.name_
+
+    @property
+    def dtype(self):
+        return self.dtype_
+
+
+class IntegerRange(object):
+    """The class of IntegerRange
+
+    This class takes a integer range between min_value and max_value
+    """
+
+    def __init__(self, min_value, max_value):
+        """Create an 'IntegerRange' object
+
+        Args:
+            min_value: The min value of the range
+            max_value: The max value of the range
+
+        Raises:
+            TypeError: If 'min_value' or 'max_value' is not an int
+            ValueError: If 'min_value' > 'max_value'
+        """
+        if not isinstance(max_value, int):
+            raise TypeError("max_value is not an integer value: %r" % (max_value,))
+        if not isinstance(min_value, int):
+            raise TypeError("min_value is not an integer value: %r" % (min_value,))
+        if min_value > max_value:
+            raise ValueError(
+                "max_value must bigger than min_value: %r > %r" % (min_value, max_value)
+            )
+        self.min_value_ = min_value
+        self.max_value_ = max_value
+
+    @property
+    def min_value(self):
+        return self.min_value_
+
+    @property
+    def max_value(self):
+        return self.max_value_
+
+
+class RealRange(object):
+    """The class of RealRange
+
+    This class takes a realnumber range between min_value and max_value
+    """
+
+    def __init__(self, min_value, max_value):
+        """Create a 'RealRange' object
+
+        Args:
+            min_value: The min value of the range
+            max_value: The max value of the range
+
+        Raises:
+            TypeError: If 'min_value' or 'max_value' is not an float
+            ValueError: If 'min_value' > 'max_value'
+        """
+        if not isinstance(max_value, float):
+            raise TypeError("max_value is not an float value: %r" % (max_value,))
+        if not isinstance(min_value, float):
+            raise TypeError("min_value is not an float value: %r" % (min_value,))
+        if min_value > max_value:
+            raise ValueError(
+                "max_value must bigger than min_value: %r > %r" % (min_value, max_value)
+            )
+        self.min_value_ = min_value
+        self.max_value_ = max_value
+
+    @property
+    def min_value(self):
+        return self.min_value_
+
+    @property
+    def max_value(self):
+        return self.max_value_
+
+
+class ValueSet(object):
+    """The class of ValueSet
+
+    This class takes a list of value
+    """
+
+    def __init__(self, values, dtype=None):
+        """Create a ValueSet object
+
+        Args:
+            values: a list of values
+            dtype: the value type
+
+        Raises:
+            ValueError: If the value type not in (int, float, bool, str)
+            TypeError: If the value in the list is not same
+        """
+        self.values_ = list(values)
+        if dtype is None:
+            if self.values_:
+                dtype = type(self.values_[0])
+        if dtype not in (int, float, bool, str):
+            raise ValueError(
+                "Value type must in (int, float, bool, str), %r is not supported!"
+                % (dtype,)
+            )
+        self.dtype_ = dtype
+        for value in self.values_:
+            if not isinstance(value, self.dtype_):
+                raise TypeError(
+                    "The type of value is not supported! value: %r type: %s"
+                    % (value, self.dtype_.__name__)
+                )
+        self.values_.sort()
+
+    @property
+    def dtype(self):
+        return self.dtype_
+
+    @property
+    def values(self):
+        return list(self.values_)
+
+
+class Metric(object):
+    """The class of Metric
+
+    This class takes a 'int' or 'float' value
+    """
+
+    def __init__(self, name, dtype=None):
+        """Create a Metric object
+
+        Args:
+            name: Metric name
+            dtype: Value type
+
+        Raises:
+            ValueError: If type is not 'int' or 'float'
+        """
+        self.name_ = name
+        if dtype is None:
+            dtype = float
+        if dtype not in (int, float):
+            raise ValueError(
+                "Value type must in (int, float), %r is not supported!" % (dtype,)
+            )
+        self.dtype_ = dtype
+
+    @property
+    def name(self):
+        return self.name_
+
+    @property
+    def dtype(self):
+        return self.dtype_
diff --git a/python/oneflow/compatible/single_client/summary/summary_projector.py b/python/oneflow/compatible/single_client/summary/summary_projector.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ab1c52b5780d56c67332bbe56b2673000cef923
--- /dev/null
+++ b/python/oneflow/compatible/single_client/summary/summary_projector.py
@@ -0,0 +1,158 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import time
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.summary import projector_pb2 as projector_pb2
+
+
+class Projector(object):
+    """The class of Projector
+
+    This class can create an 'embedding_projector' or 'exception_projector'
+    """
+
+    def __init__(self, logdir=None):
+        """Create a Projector objector
+
+        Args:
+            logdir: The log dir
+
+        Raises:
+            Exception: If 'logdir' is None or illegal
+        """
+        if logdir is None:
+            raise Exception("logdir should not be None!")
+        logdir += "/projector"
+        if not os.path.exists(logdir):
+            os.makedirs(logdir)
+        self.logdir_ = logdir
+        self.embedding_filename_ = None
+        self.exception_filename_ = None
+
+    def create_embedding_projector(self):
+        if self.embedding_filename_ is not None and os.path.exists(
+            self.embedding_filename_
+        ):
+            raise OSError("You must create only one embedding projector!")
+        self.embedding_filename_ = (
+            self.logdir_ + "/projector." + str(int(time.time())) + ".log"
+        )
+
+    def create_exception_projector(self):
+        if self.exception_filename_ is not None and os.path.exists(
+            self.exception_filename_
+        ):
+            raise OSError("You must create only one embedding projector!")
+        self.exception_filename_ = (
+            self.logdir_ + "/projector.gradit." + str(int(time.time())) + ".log"
+        )
+
+    @property
+    def logdir(self):
+        return self.logdir_
+
+    @property
+    def exception_filename(self):
+        return self.exception_filename_
+
+    @property
+    def embedding_filename(self):
+        return self.embedding_filename_
+
+    def write_projector(self, filename=None, projector=None):
+        with open(filename, "wb") as f:
+            f.write(projector.SerializeToString())
+            f.flush()
+
+    def set_tensor(self, tensor: projector_pb2.Tensor, value):
+        for d in value.shape:
+            td = tensor.shape.dim.add()
+            td.size = d
+        tensor.dtype = str(value.dtype)
+        tensor.content = value.tobytes()
+
+    def set_projector(self, pro, tag, step, value, label=None):
+        pro.tag = str(tag)
+        pro.step = step
+        pro.WALL_TIME = time.time()
+        self.set_tensor(pro.value, value)
+        if label is not None:
+            self.set_tensor(pro.label, label)
+
+    def set_sample(self, sample, name, x, sample_type):
+        if name is not None:
+            sample.name = name
+        if sample_type == "image" or sample_type == "IMAGE":
+            sample.type = projector_pb2.Sample.SampleType.IMAGE
+        elif sample_type == "audio" or sample_type == "AUDIO":
+            sample.type = projector_pb2.Sample.SampleType.AUDIO
+        elif sample_type == "text" or sample_type == "TEXT":
+            sample.type = projector_pb2.Sample.SampleType.TEXT
+        else:
+            raise NotImplementedError
+        if x is not None:
+            self.set_tensor(sample.X, x)
+
+    def embedding_projector(
+        self,
+        value=None,
+        label=None,
+        tag=None,
+        step=None,
+        sample_name=None,
+        sample_type=None,
+        x=None,
+    ):
+        if tag is None:
+            tag = "embedding_projector"
+        summary_projector = projector_pb2.SummaryProjector()
+        summary_projector.metadata.type = projector_pb2.MetaData.ProjectorType.EMBEDDING
+        projector = summary_projector.projector.add()
+        self.set_projector(pro=projector, tag=tag, step=step, value=value, label=label)
+        if sample_name is not None and sample_type is not None:
+            self.set_sample(
+                sample=summary_projector.sample,
+                name=sample_name,
+                x=x,
+                sample_type=sample_type,
+            )
+        self.write_projector(self.embedding_filename_, summary_projector)
+
+    def exception_projector(
+        self,
+        value=None,
+        tag=None,
+        step=None,
+        sample_name=None,
+        sample_type=None,
+        x=None,
+    ):
+        if tag is None:
+            tag = "exception_projector"
+        summary_projector = projector_pb2.SummaryProjector()
+        summary_projector.metadata.type = projector_pb2.MetaData.ProjectorType.EXCEPTION
+        projector = summary_projector.projector.add()
+        self.set_projector(pro=projector, tag=tag, step=step, value=value)
+        if sample_name is not None and sample_type is not None:
+            self.set_sample(
+                sample=summary_projector.sample,
+                name=sample_name,
+                x=x,
+                sample_type=sample_type,
+            )
+        self.write_projector(self.exception_filename_, summary_projector)
diff --git a/python/oneflow/compatible/single_client/support/__init__.py b/python/oneflow/compatible/single_client/support/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/compatible/single_client/support/async_util.py b/python/oneflow/compatible/single_client/support/async_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8143316f54ae66eb5e431715bb009a4f140321f2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/async_util.py
@@ -0,0 +1,38 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import threading
+
+
+def Await(counter, func):
+    assert counter > 0
+    cond_var = threading.Condition()
+    counter_box = [counter]
+    result_list = []
+
+    def Yield(result=None):
+        result_list.append(result)
+        cond_var.acquire()
+        assert counter_box[0] > 0
+        counter_box[0] -= 1
+        cond_var.notify()
+        cond_var.release()
+
+    func(Yield)
+    cond_var.acquire()
+    while counter_box[0] > 0:
+        cond_var.wait()
+    cond_var.release()
+    return result_list
diff --git a/python/oneflow/compatible/single_client/support/box.py b/python/oneflow/compatible/single_client/support/box.py
new file mode 100644
index 0000000000000000000000000000000000000000..91442247fc7b793255d6121467595229aa729f7e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/box.py
@@ -0,0 +1,40 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class Box(object):
+    def __init__(self, *arg):
+        assert len(arg) <= 1
+        self.has_value_ = len(arg) > 0
+        self.value_ = None
+        if self.has_value_:
+            self.value_ = arg[0]
+
+    @property
+    def value(self):
+        assert self.has_value_
+        return self.value_
+
+    @property
+    def value_setter(self):
+        return lambda val: self.set_value(val)
+
+    def set_value(self, val):
+        self.value_ = val
+        self.has_value_ = True
+
+    def has_value(self):
+        return self.has_value_
diff --git a/python/oneflow/compatible/single_client/support/enable_if.py b/python/oneflow/compatible/single_client/support/enable_if.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f2ddca00914205333c95540541c6ec9fa895927
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/enable_if.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+
+from oneflow.compatible.single_client.support import traceinfo as traceinfo
+
+
+def condition(hob_expr):
+    def Decorator(func):
+        func.__oneflow_condition_hob__ = hob_expr
+        return func
+
+    return Decorator
+
+
+def get_condition_hob(func):
+    assert hasattr(func, "__oneflow_condition_hob__")
+    return func.__oneflow_condition_hob__
+
+
+def set_condition_hob(func, hob):
+    func.__oneflow_condition_hob__ = hob
+
+
+def unique(arg_funcs, context=None, default=None):
+    assert isinstance(arg_funcs, (list, tuple))
+    conditional_functions = []
+    for arg_func in arg_funcs:
+        if isinstance(arg_func, tuple):
+            (func, hob_expr) = arg_func
+        elif inspect.isfunction(arg_func):
+            func = arg_func
+            assert hasattr(func, "__oneflow_condition_hob__")
+            hob_expr = func.__oneflow_condition_hob__
+        else:
+            raise NotImplementedError
+        debug_str = func.__name__
+        if hasattr(func, "__debug_str__"):
+            debug_str = func.__debug_str__
+        conditional_functions.append((hob_expr, func, debug_str))
+    if default is None:
+
+        def default(get_failed_info, *args, **kwargs):
+            raise NotImplementedError(get_failed_info())
+
+    matched_func = GetMatchedFunction(default, conditional_functions, context=context)
+    if matched_func is not None:
+        return matched_func
+    return MakeDefaultFunction(default, conditional_functions, context=context)
+
+
+def GetMatchedFunction(default, conditional_functions, context=None):
+    select_triple = (None, None, None)
+    for triple in conditional_functions:
+        if not triple[0](context):
+            continue
+        if select_triple[1] is not None:
+            return _MultiMatchedErrorFunction(
+                default, [select_triple, triple], context=context
+            )
+        select_triple = triple
+    return select_triple[1]
+
+
+def MakeDefaultFunction(default, conditional_functions, context=None):
+    def get_failed_info(customized_prompt=None):
+        failed_info = "no avaliable function found.\n"
+        for (bf, func, location) in conditional_functions:
+            prompt = location if customized_prompt is None else customized_prompt
+            failed_info += "\n%s: \x1b[1;31mFAILED\x1b[0m\n\t%s\n" % (
+                prompt,
+                bf.debug_str(context),
+            )
+        return failed_info
+
+    return lambda *args, **kwargs: default(get_failed_info, *args, **kwargs)
+
+
+def _MultiMatchedErrorFunction(default, matched_functions, context=None):
+    def get_failed_info(customized_prompt=None):
+        failed_info = "at least two conditional functions matched.\n"
+        for (bf, func, location) in matched_functions:
+            prompt = location if customized_prompt is None else customized_prompt
+            failed_info += "\n%s: \x1b[1;31mPASSED\x1b[0m\n\t%s\n" % (
+                prompt,
+                bf.debug_str(context),
+            )
+        return failed_info
+
+    return lambda *args, **kwargs: default(get_failed_info, *args, **kwargs)
diff --git a/python/oneflow/compatible/single_client/support/func_inspect_util.py b/python/oneflow/compatible/single_client/support/func_inspect_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..acfb0ced42062f6104d93523835ff0b0890d48fc
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/func_inspect_util.py
@@ -0,0 +1,49 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import sys
+
+if sys.version_info > (2, 7) and sys.version_info < (3, 0):
+
+    def GetArgNameAndDefaultTuple(func):
+        """
+      returns a dictionary of arg_name:default_values for the input function
+      """
+        (args, varargs, keywords, defaults) = inspect.getargspec(func)
+        defaults = list(defaults) if defaults is not None else []
+        while len(defaults) < len(args):
+            defaults.insert(0, None)
+        return tuple(zip(args, defaults))
+
+
+elif sys.version_info >= (3, 0):
+
+    def GetArgNameAndDefaultTuple(func):
+        signature = inspect.signature(func)
+        return tuple(
+            [
+                (k, v.default if v.default is not inspect.Parameter.empty else None)
+                for (k, v) in signature.parameters.items()
+            ]
+        )
+
+
+else:
+    raise NotImplementedError
+
+
+def GetArgDefaults(func):
+    return tuple(map(lambda x: x[1], GetArgNameAndDefaultTuple(func)))
diff --git a/python/oneflow/compatible/single_client/support/high_order_bool.py b/python/oneflow/compatible/single_client/support/high_order_bool.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf507cab030921b053ad5dea1ea01c440f484330
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/high_order_bool.py
@@ -0,0 +1,207 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.compatible import single_client as flow
+
+
+def bool_functor(verbose_debug_str):
+    def Decorator(match_function):
+        return HighOrderBool(verbose_debug_str, match_function)
+
+    return Decorator
+
+
+def hob_context_attr(attr_name):
+    def Decorator(attr_getter):
+        return HobContextAttr(attr_name, attr_getter)
+
+    return Decorator
+
+
+class BoolFunctor(object):
+    def debug_str(self, ctx, display_result=True):
+        if hasattr(self, "__debug_str__"):
+            if display_result:
+                return '"%s"[%s]' % (self.__debug_str__, self(ctx))
+            else:
+                return '"%s"' % self.__debug_str__
+        return self.verbose_debug_str(ctx, display_result=display_result)
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        raise NotImplementedError
+
+    def __call__(self, ctx):
+        raise NotImplementedError
+
+    def __and__(self, rhs):
+        return _AndBoolFunctor(self, rhs)
+
+    def __or__(self, rhs):
+        return _OrBoolFunctor(self, rhs)
+
+    def __invert__(self):
+        return _NotBoolFunctor(self)
+
+
+class HighOrderBool(BoolFunctor):
+    def __init__(self, verbose_debug_str, function):
+        self.verbose_debug_str_ = verbose_debug_str
+        self.function_ = function
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        if display_result:
+            return '"%s"[%s]' % (self.verbose_debug_str_, self.function_(ctx))
+        else:
+            return '"%s"' % self.verbose_debug_str_
+
+    def __call__(self, ctx):
+        return self.function_(ctx)
+
+
+always_true = HighOrderBool("Always true", lambda: True)
+always_false = HighOrderBool("Always false", lambda: False)
+
+
+class _AndBoolFunctor(BoolFunctor):
+    def __init__(self, lhs, rhs):
+        assert isinstance(lhs, BoolFunctor)
+        assert isinstance(rhs, BoolFunctor)
+        self.lhs_ = lhs
+        self.rhs_ = rhs
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        left_display = self.lhs_.debug_str(ctx, display_result)
+        display_result = display_result and self.lhs_(ctx)
+        right_display = self.rhs_.debug_str(ctx, display_result)
+        return "(%s and %s)" % (left_display, right_display)
+
+    def __call__(self, ctx):
+        return self.lhs_(ctx) and self.rhs_(ctx)
+
+
+class _OrBoolFunctor(BoolFunctor):
+    def __init__(self, lhs, rhs):
+        assert isinstance(lhs, BoolFunctor)
+        assert isinstance(rhs, BoolFunctor)
+        self.lhs_ = lhs
+        self.rhs_ = rhs
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        left_display = self.lhs_.debug_str(ctx, display_result)
+        display_result = display_result and (not self.lhs_(ctx))
+        right_display = self.rhs_.debug_str(ctx, display_result)
+        return "(%s or %s)" % (left_display, right_display)
+
+    def __call__(self, ctx):
+        return self.lhs_(ctx) or self.rhs_(ctx)
+
+
+class _NotBoolFunctor(BoolFunctor):
+    def __init__(self, x):
+        assert isinstance(x, BoolFunctor)
+        self.x_ = x
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        return "(not %s)" % self.x_.debug_str(ctx, display_result)
+
+    def __call__(self, ctx):
+        return not self.x_(ctx)
+
+
+class HobContextGetter(object):
+    def __init__(self, attr_name, attr_getter):
+        self.attr_name_ = attr_name
+        self.attr_getter_ = attr_getter
+
+    @property
+    def attr_name(self):
+        return self.attr_name_
+
+    @property
+    def attr_getter(self):
+        return self.attr_getter_
+
+    def __eq__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "==", lambda a, b: a == b)
+
+    def __ne__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "!=", lambda a, b: a != b)
+
+    def __gt__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, ">", lambda a, b: a > b)
+
+    def __ge__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, ">=", lambda a, b: a >= b)
+
+    def __lt__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "<", lambda a, b: a < b)
+
+    def __le__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "<=", lambda a, b: a <= b)
+
+    def _MakeHob(self, other, cmp_str, cmp_func):
+        @bool_functor("%s %s %s" % (self.attr_name, cmp_str, other.attr_name))
+        def HobHob(context):
+            return cmp_func(self.attr_getter(context), other.attr_getter(context))
+
+        return HobHob
+
+
+class HobContextConstant(HobContextGetter):
+    def __init__(self, value):
+        HobContextGetter.__init__(self, str(value), lambda ctx: value)
+
+
+class HobContextAttr(HobContextGetter):
+    def __init__(self, attr_name, attr_getter):
+        HobContextGetter.__init__(self, attr_name, attr_getter)
+
+    def __getattr__(self, attr_name):
+        @hob_context_attr("%s.%s" % (self.attr_name, attr_name))
+        def HobCtxAttr(ctx):
+            obj = self.attr_getter(ctx)
+            if isinstance(obj, oneflow._oneflow_internal.CfgMessage):
+                return getattr(obj, attr_name)()
+            else:
+                return getattr(obj, attr_name)
+
+        return HobCtxAttr
+
+    def HasField(self, attr_name):
+        @bool_functor('%s.HasField("%s")' % (self.attr_name, attr_name))
+        def BoolFunctor(ctx):
+            obj = self.attr_getter(ctx)
+            if isinstance(obj, oneflow._oneflow_internal.CfgMessage):
+                assert hasattr(obj, "has_" + attr_name), type(obj)
+                return getattr(obj, "has_" + attr_name)()
+            elif hasattr(obj, "HasField"):
+                return obj.HasField(attr_name)
+            else:
+                return hasattr(obj, attr_name)
+
+        return BoolFunctor
diff --git a/python/oneflow/compatible/single_client/support/lazy.py b/python/oneflow/compatible/single_client/support/lazy.py
new file mode 100644
index 0000000000000000000000000000000000000000..27660fd03d5676f0c922128188101b471b82e8e1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/lazy.py
@@ -0,0 +1,29 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class Lazy(object):
+    def __init__(self, get_value):
+        self.value_ = None
+        self.has_value_ = False
+        self.get_value_ = get_value
+
+    @property
+    def value(self):
+        if not self.has_value_:
+            self.value_ = self.get_value_()
+            self.has_value_ = True
+        return self.value_
diff --git a/python/oneflow/compatible/single_client/support/pb_util.py b/python/oneflow/compatible/single_client/support/pb_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1299a91f2395a251dc078da5d9c8c39e6a4884ad
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/pb_util.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def PythonDict2CFG(value, msg):
+    def extend_dict(values, msg):
+        for (k, v) in values.items():
+            if type(v) is dict:
+                extend_dict(v, getattr(msg, "mutable_" + k)())
+            elif type(v) is list or type(v) is tuple:
+                extend_list_or_tuple(v, msg, k)
+            else:
+                getattr(msg, "set_" + k)(v)
+
+    def extend_list_or_tuple(values, msg, attr):
+        if len(values) == 0 or type(values[0]) is dict:
+            msg = getattr(msg, "mutable_" + attr)()
+            for v in values:
+                cmd = msg.Add()
+                extend_dict(v, cmd)
+        else:
+            for v in values:
+                getattr(msg, "add_" + attr)(v)
+
+    extend_dict(value, msg)
+    return msg
+
+
+def PythonDict2PbMessage(value, msg):
+    def extend_dict(values, msg):
+        for (k, v) in values.items():
+            if type(v) is dict:
+                extend_dict(v, getattr(msg, k))
+            elif type(v) is list or type(v) is tuple:
+                extend_list_or_tuple(v, getattr(msg, k))
+            else:
+                setattr(msg, k, v)
+        else:
+            msg.SetInParent()
+
+    def extend_list_or_tuple(values, msg):
+        if len(values) == 0:
+            return
+        if type(values[0]) is dict:
+            for v in values:
+                cmd = msg.add()
+                extend_dict(v, cmd)
+        else:
+            msg.extend(values)
+
+    extend_dict(value, msg)
+    return msg
+
+
+def MergePbMessage(dst, src):
+    assert type(dst) is type(src)
+    for field in dst.DESCRIPTOR.fields:
+        field_name = field.name
+        if field.containing_oneof is not None:
+            if dst.WhichOneof(field.containing_oneof.name) is not None:
+                continue
+            src_field_name = src.WhichOneof(field.containing_oneof.name)
+            if src_field_name is None:
+                continue
+            if field_name != src_field_name:
+                continue
+        else:
+            if dst.HasField(field_name):
+                continue
+            if not src.HasField(field_name):
+                continue
+        _MergePbMessageField(dst, src, field)
+
+
+def _MergePbMessageField(dst, src, field):
+    if field.message_type is None:
+        setattr(dst, field.name, getattr(src, field.name))
+    else:
+        MergePbMessage(getattr(dst, field.name), getattr(src, field.name))
diff --git a/python/oneflow/compatible/single_client/support/scope_stack.py b/python/oneflow/compatible/single_client/support/scope_stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..28014d06035a6133ed37a2cf9b254e5a35372066
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/scope_stack.py
@@ -0,0 +1,34 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+
+class ScopeStack(object):
+    def __init__(self, init=[]):
+        if not isinstance(init, list):
+            init = [init]
+        assert isinstance(init, list)
+        self.stack_ = init
+
+    def Current(self):
+        assert len(self.stack_) > 0
+        return self.stack_[0]
+
+    @contextmanager
+    def NewScope(self, scope):
+        self.stack_.insert(0, scope)
+        yield
+        self.stack_.pop(0)
diff --git a/python/oneflow/compatible/single_client/support/traceinfo.py b/python/oneflow/compatible/single_client/support/traceinfo.py
new file mode 100644
index 0000000000000000000000000000000000000000..94cae3f84e6250e6cc2a1dd3a3fbe4c2450bda34
--- /dev/null
+++ b/python/oneflow/compatible/single_client/support/traceinfo.py
@@ -0,0 +1,34 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import traceback
+
+
+def GetFrameLocationStr(depth=-1):
+    assert depth < 0
+    frame = traceback.extract_stack()[depth - 1]
+    return "%s:%d" % (frame[0], frame[1])
+
+
+def GetStackInfoExcludeOneflowPythonFile():
+    from oneflow.compatible import single_client as flow
+
+    dirname = os.path.dirname(oneflow.__file__)
+    stack_info = traceback.extract_stack()
+    filtered_stack_info = filter(
+        lambda x: x[0].startswith(dirname) == False, stack_info
+    )
+    return list(filtered_stack_info)
diff --git a/python/oneflow/compatible/single_client/sysconfig.py b/python/oneflow/compatible/single_client/sysconfig.py
new file mode 100644
index 0000000000000000000000000000000000000000..64238aaa8463c6d54028ae448623d24eaf99c8a1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/sysconfig.py
@@ -0,0 +1,25 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.sysconfig import (
+    get_compile_flags,
+    get_include,
+    get_lib,
+    get_link_flags,
+    has_rpc_backend_grpc,
+    has_rpc_backend_local,
+    with_cuda,
+    with_xla,
+)
diff --git a/python/oneflow/compatible/single_client/system.py b/python/oneflow/compatible/single_client/system.py
new file mode 100644
index 0000000000000000000000000000000000000000..182165c19da45690f10c53be0857b55b92daaa39
--- /dev/null
+++ b/python/oneflow/compatible/single_client/system.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.ops.assign_op import api_system_assign as assign
diff --git a/python/oneflow/compatible/single_client/tensorrt.py b/python/oneflow/compatible/single_client/tensorrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..294baa0075314187d6d9f8e372f118079bea9595
--- /dev/null
+++ b/python/oneflow/compatible/single_client/tensorrt.py
@@ -0,0 +1,19 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.contrib.tensorrt.tensorrt_api import (
+    cache_int8_calibration,
+    write_int8_calibration,
+)
diff --git a/python/oneflow/compatible/single_client/test/custom_ops/test_user_sigmoid.py b/python/oneflow/compatible/single_client/test/custom_ops/test_user_sigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..f67ba76b234f1700504546f899b8734c8ebd7923
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/custom_ops/test_user_sigmoid.py
@@ -0,0 +1,120 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client as flow
+import oneflow.compatible.single_client.typing as oft
+import oneflow.compatible.single_client.unittest
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+module_path = os.path.dirname(os.path.abspath(__file__))
+print("module_path:", module_path)
+print("pwd_path:", os.getcwd())
+user_sigmoid_op = flow.experimental.custom_op_module("user_sigmoid", module_path)
+user_sigmoid_op.py_api().cpp_def().py_kernel().build_load()
+
+
+def numpy_sigmoid(x):
+    return 1 / (1 + np.exp(-x))
+
+
+def numpy_sigmoid_grad(y, dy):
+    return y * (1 - y) * dy
+
+
+def make_job(input_shape, dtype=flow.float32):
+    @flow.global_function(function_config=func_config)
+    def sigmoid_job(x: oft.Numpy.Placeholder(input_shape, dtype=dtype)):
+        return flow.math.sigmoid(x)
+
+    return sigmoid_job
+
+
+def make_grad_job(y_shape, dy_shape, dtype=flow.float32):
+    @flow.global_function(function_config=func_config)
+    def sigmoid_grad_job(
+        y: oft.Numpy.Placeholder(y_shape, dtype=dtype),
+        dy: oft.Numpy.Placeholder(dy_shape, dtype=dtype),
+    ):
+        return flow.math.sigmoid_grad(y, dy)
+
+    return sigmoid_grad_job
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUserSigmoid(flow.unittest.TestCase):
+    def test_user_sigmoid(test_case):
+        flow.clear_default_session()
+
+        def make_py_job(input_shape, dtype=flow.float32):
+            @flow.global_function(function_config=func_config)
+            def sigmoid_py_job(x: oft.Numpy.Placeholder(input_shape, dtype=dtype)):
+                with flow.scope.placement("cpu", "0:0"):
+                    return user_sigmoid_op.api.user_sigmoid_forward(x)
+
+            return sigmoid_py_job
+
+        x = np.ones((1, 10), dtype=np.float32)
+        sig_job = make_job(x.shape)
+        py_sig_job = make_py_job(x.shape)
+        sig = sig_job(x).get().numpy()
+        py_sig = py_sig_job(x).get().numpy()
+        numpy_sig = numpy_sigmoid(x)
+        print("sig : ", sig)
+        print("py_sig : ", py_sig)
+        print("numpy_sig : ", numpy_sig)
+        test_case.assertTrue(np.allclose(sig, py_sig, rtol=0.001, atol=1e-05))
+        test_case.assertTrue(np.allclose(py_sig, numpy_sig, rtol=0.001, atol=1e-05))
+
+    def test_user_sigmoid_grad(test_case):
+        flow.clear_default_session()
+
+        def make_py_grad_job(y_shape, dy_shape, dtype=flow.float32):
+            @flow.global_function(function_config=func_config)
+            def sigmoid_py_grad_job(
+                y: oft.Numpy.Placeholder(y_shape, dtype=dtype),
+                dy: oft.Numpy.Placeholder(dy_shape, dtype=dtype),
+            ):
+                with flow.scope.placement("cpu", "0:0"):
+                    return user_sigmoid_op.api.user_sigmoid_backward(y, dy)
+
+            return sigmoid_py_grad_job
+
+        x = np.ones((1, 10), dtype=np.float32)
+        y = 0.5 * np.ones((1, 10), dtype=np.float32)
+        dy = 0.2 * np.ones((1, 10), dtype=np.float32)
+        sig_grad_job = make_grad_job(y.shape, dy.shape)
+        py_sig_grad_job = make_py_grad_job(y.shape, dy.shape)
+        sig_grad = sig_grad_job(y, dy).get().numpy()
+        py_sig_grad = py_sig_grad_job(y, dy).get().numpy()
+        numpy_sig_grad = numpy_sigmoid_grad(y, dy)
+        print("sig_grad", sig_grad)
+        print("py_sig_grad", py_sig_grad)
+        print("numpy_sig_grad", numpy_sig_grad)
+        test_case.assertTrue(np.allclose(sig_grad, py_sig_grad, rtol=0.001, atol=1e-05))
+        test_case.assertTrue(
+            np.allclose(py_sig_grad, numpy_sig_grad, rtol=0.001, atol=1e-05)
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_cpp_def.cpp b/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_cpp_def.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d786666c92fd9f94d46f406f413ec53664facea
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_cpp_def.cpp
@@ -0,0 +1,94 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+
+namespace oneflow {
+
+namespace {
+
+REGISTER_USER_OP("user_sigmoid_forward")
+    .Input("x")
+    .Output("y")
+    .Attr<std::string>("device_sub_tag", "py")
+    .SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe<void> {
+      const Shape& in_shape = ctx->InputShape("x", 0);
+      Shape* out_shape = ctx->OutputShape("y", 0);
+      *out_shape = in_shape;
+      return Maybe<void>::Ok();
+    })
+    .SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe<void> {
+      const user_op::TensorDesc& in_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0);
+      FOR_RANGE(int64_t, i, 0, in_tensor.shape().NumAxes()) {
+        ctx->NewBuilder().Split(user_op::OpArg("x", 0), i).Split(user_op::OpArg("y", 0), i).Build();
+      }
+      return Maybe<void>::Ok();
+    })
+    .SetDataTypeInferFn([](user_op::InferContext* ctx) -> Maybe<void> {
+      *ctx->OutputDType("y", 0) = ctx->InputDType("x", 0);
+      return Maybe<void>::Ok();
+    });
+
+REGISTER_USER_OP("user_sigmoid_backward")
+    .Input("y")
+    .Input("dy")
+    .Output("dx")
+    .Attr<std::string>("device_sub_tag", "py")
+    .SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe<void> {
+      const Shape& y_shape = ctx->InputShape("y", 0);
+      const Shape& dy_shape = ctx->InputShape("dy", 0);
+      Shape* dx_shape = ctx->OutputShape("dx", 0);
+      CHECK(dy_shape == y_shape);
+      *dx_shape = dy_shape;
+      return Maybe<void>::Ok();
+    })
+    .SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe<void> {
+      const user_op::TensorDesc& y_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("y", 0);
+      FOR_RANGE(int64_t, i, 0, y_tensor.shape().NumAxes()) {
+        ctx->NewBuilder()
+            .Split(user_op::OpArg("y", 0), i)
+            .Split(user_op::OpArg("dy", 0), i)
+            .Split(user_op::OpArg("dx", 0), i)
+            .Build();
+      }
+      return Maybe<void>::Ok();
+    })
+    .SetDataTypeInferFn([](user_op::InferContext* ctx) -> Maybe<void> {
+      *ctx->OutputDType("dx", 0) = ctx->InputDType("y", 0);
+      return Maybe<void>::Ok();
+    });
+
+REGISTER_USER_OP_GRAD("user_sigmoid_forward")
+    .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) -> Maybe<void> {
+      const auto grad_op_name = ctx->FwOp().op_name() + "_grad";
+      const auto& grad_op_func = [&ctx](user_op::BackwardOpBuilder& builder) {
+        return builder.OpTypeName("user_sigmoid_backward")
+            .InputBind("y", ctx->FwOp().output("y", 0))
+            .InputBind("dy", ctx->FwOp().output_grad("y", 0))
+            .Output("dx")
+            .Build();
+      };
+      ctx->DefineOp(grad_op_name, grad_op_func);
+
+      const auto& dx_get_func = [&ctx, &grad_op_name]() -> const std::string& {
+        return ctx->GetOp(grad_op_name).output("dx", 0);
+      };
+      ctx->FwOp().InputGradBind(user_op::OpArg("x", 0), dx_get_func);
+      return Maybe<void>::Ok();
+    });
+
+}  // namespace
+
+}  // namespace oneflow
diff --git a/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_py_api.py b/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_py_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..e99de616d07ba4258799d45a23f1c421795df472
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_py_api.py
@@ -0,0 +1,48 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Callable, List, Optional, Sequence, Tuple, Union
+
+import oneflow.compatible.single_client as flow
+
+
+def user_sigmoid_forward(x, name: Optional[str] = None):
+    return (
+        flow.user_op_builder(
+            name if name is not None else flow.util.unique_str("UserSigmoidForward_")
+        )
+        .Op("user_sigmoid_forward")
+        .Input("x", [x])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def user_sigmoid_backward(y, dy, name: Optional[str] = None):
+    return (
+        flow.user_op_builder(
+            name if name is not None else flow.util.unique_str("UerSigmoidBackward_")
+        )
+        .Op("user_sigmoid_backward")
+        .Input("y", [y])
+        .Input("dy", [dy])
+        .Output("dx")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_py_kernel.py b/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_py_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea14e1f3d0d53b1b9e180792617f4555f63155f9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/custom_ops/user_sigmoid/user_sigmoid_py_kernel.py
@@ -0,0 +1,29 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+
+def forward(args):
+    print("user sigmoid forward args", args)
+    (x,) = args
+    y = 1 / (1 + np.exp(-x))
+    return y
+
+
+def backward(args):
+    print("user sigmoid backward args", args)
+    (y, dy) = args
+    return y * (1 - y) * dy
diff --git a/python/oneflow/compatible/single_client/test/models/1node_test.py b/python/oneflow/compatible/single_client/test/models/1node_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7c3e5a823dce2746ad4a26138e9bf5aa918f81a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/1node_test.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import env_1node
+from absl import app
+from absl.testing import absltest
+from cnns_tests import (
+    TestAlexNetMixin,
+    TestInceptionV3Mixin,
+    TestResNet50Mixin,
+    TestVgg16Mixin,
+)
+from test_1node_mixin import Test1NodeMixin
+
+from oneflow.compatible import single_client as flow
+
+
+class TestAlexNet(Test1NodeMixin, TestAlexNetMixin, absltest.TestCase):
+    pass
+
+
+class TestResNet50(Test1NodeMixin, TestResNet50Mixin, absltest.TestCase):
+    pass
+
+
+class TestVgg16(Test1NodeMixin, TestVgg16Mixin, absltest.TestCase):
+    pass
+
+
+class TestInceptionV3(Test1NodeMixin, TestInceptionV3Mixin, absltest.TestCase):
+    pass
+
+
+flow.unittest.register_test_cases(
+    scope=globals(),
+    directory=os.path.dirname(os.path.realpath(__file__)),
+    filter_by_num_nodes=lambda x: x == 1,
+    base_class=absltest.TestCase,
+)
+
+
+def main(argv):
+    env_1node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/compatible/single_client/test/models/2node_test.py b/python/oneflow/compatible/single_client/test/models/2node_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..419af764bcd872828ceb2506beb772b26852422f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/2node_test.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import cnns_tests
+import env_2node
+import numpy
+from absl import app
+from absl.testing import absltest
+from test_2node_mixin import Test2NodeMixin
+
+from oneflow.compatible import single_client as flow
+
+
+class TestAlexNet(Test2NodeMixin, cnns_tests.TestAlexNetMixin, absltest.TestCase):
+    pass
+
+
+class TestResNet50(Test2NodeMixin, cnns_tests.TestResNet50Mixin, absltest.TestCase):
+    pass
+
+
+class TestVgg16(Test2NodeMixin, cnns_tests.TestVgg16Mixin, absltest.TestCase):
+    pass
+
+
+class TestInceptionV3(
+    Test2NodeMixin, cnns_tests.TestInceptionV3Mixin, absltest.TestCase
+):
+    pass
+
+
+flow.unittest.register_test_cases(
+    scope=globals(),
+    directory=os.path.dirname(os.path.realpath(__file__)),
+    filter_by_num_nodes=lambda x: x == 2,
+    base_class=absltest.TestCase,
+)
+
+
+def main(argv):
+    env_2node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/compatible/single_client/test/models/alexnet.py b/python/oneflow/compatible/single_client/test/models/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8f51cc8be36f6536fe9d61e87389ec1bd9c0a39
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/alexnet.py
@@ -0,0 +1,288 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG227/of_record_repeated"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/alexnet/models/of_model_bk"
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-nn", "--num_nodes", type=str, default=1, required=False)
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+
+
+def _conv2d_layer(
+    args,
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.random_uniform_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.nn.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    (image, label) = flow.data.ofrecord_image_classification_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        image_feature_name="encoded",
+        label_feature_name="class/label",
+        color_space="RGB",
+        name="decode",
+    )
+    rsz = flow.image.resize(image, resize_x=227, resize_y=227, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def alexnet(args, images, labels, trainable=True):
+    conv1 = _conv2d_layer(
+        args, "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv2d_layer(args, "conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv2d_layer(args, "conv3", pool2, filters=384)
+    conv4 = _conv2d_layer(args, "conv4", conv3, filters=384)
+    conv5 = _conv2d_layer(args, "conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc1",
+    )
+    dropout1 = fc1
+    fc2 = flow.layers.dense(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc2",
+    )
+    dropout2 = fc2
+    fc3 = flow.layers.dense(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc3",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc3, name="softmax_loss"
+    )
+    return loss
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    flow.config.enable_legacy_model_io(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float)
+    func_config.cudnn_conv_force_fwd_algo(0)
+    func_config.cudnn_conv_force_bwd_data_algo(1)
+    func_config.cudnn_conv_force_bwd_filter_algo(1)
+    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def alexnet_train_job():
+        (labels, images) = _data_load_layer(args, args.train_dir)
+        loss = alexnet(args, images, labels)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(function_config=func_config)
+    def alexnet_eval_job():
+        with flow.scope.consistent_view():
+            (labels, images) = _data_load_layer(args, args.eval_dir)
+            return alexnet(args, images, labels, False)
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning alexnet: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = alexnet_train_job().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/alexnet"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    args.num_nodes = len(args.node_list.strip().split(",")) if args.multinode else 1
+    flow.env.ctrl_port(9788)
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/compatible/single_client/test/models/alexnet_with_unpack.py b/python/oneflow/compatible/single_client/test/models/alexnet_with_unpack.py
new file mode 100644
index 0000000000000000000000000000000000000000..bef9480baf37b67dcd26bdd3b8b620ef43e4f227
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/alexnet_with_unpack.py
@@ -0,0 +1,348 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG227/of_record_repeated"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/alexnet/models/of_model_bk"
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.num_unpack = 2
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-nn", "--num_nodes", type=str, default=1, required=False)
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+parser.add_argument("-p", "--num_piece_in_batch", type=int, default=2, required=False)
+
+
+def _conv2d_layer(
+    args,
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.random_uniform_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    weight = flow.identity(weight)
+    weight = flow.repeat(weight, args.num_piece_in_batch)
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        bias = flow.identity(bias)
+        bias = flow.repeat(bias, args.num_piece_in_batch)
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=227, resize_y=227, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (
+        flow.unpack(label, args.num_piece_in_batch),
+        flow.unpack(normal, args.num_piece_in_batch),
+    )
+
+
+def _dense_layer(
+    inputs,
+    units,
+    activation=None,
+    use_bias=True,
+    kernel_initializer=None,
+    bias_initializer=None,
+    trainable=True,
+    name=None,
+):
+    in_shape = inputs.shape
+    in_num_axes = len(in_shape)
+    assert in_num_axes >= 2
+    name_prefix = name if name is not None else id_util.UniqueStr("Dense_")
+    inputs = flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs
+    weight = flow.get_variable(
+        name="{}-weight".format(name_prefix),
+        shape=(units, inputs.shape[1]),
+        dtype=inputs.dtype,
+        initializer=kernel_initializer
+        if kernel_initializer is not None
+        else flow.constant_initializer(0),
+        trainable=trainable,
+        model_name="weight",
+    )
+    weight = flow.identity(weight)
+    weight = flow.repeat(weight, args.num_piece_in_batch)
+    out = flow.matmul(
+        a=inputs, b=weight, transpose_b=True, name="{}_matmul".format(name_prefix)
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name="{}-bias".format(name_prefix),
+            shape=(units,),
+            dtype=inputs.dtype,
+            initializer=bias_initializer
+            if bias_initializer is not None
+            else flow.constant_initializer(0),
+            trainable=trainable,
+            model_name="bias",
+        )
+        bias = flow.identity(bias)
+        bias = flow.repeat(bias, args.num_piece_in_batch)
+        out = flow.nn.bias_add(out, bias, name="{}_bias_add".format(name_prefix))
+    out = (
+        activation(out, name="{}_activation".format(name_prefix))
+        if activation is not None
+        else out
+    )
+    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
+    return out
+
+
+def alexnet(args, images, labels, trainable=True):
+    conv1 = _conv2d_layer(
+        args, "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv2d_layer(args, "conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv2d_layer(args, "conv3", pool2, filters=384)
+    conv4 = _conv2d_layer(args, "conv4", conv3, filters=384)
+    conv5 = _conv2d_layer(args, "conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = _dense_layer(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc1",
+    )
+    dropout1 = fc1
+    fc2 = _dense_layer(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc2",
+    )
+    dropout2 = fc2
+    fc3 = _dense_layer(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc3",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc3, name="softmax_loss"
+    )
+    return loss
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float)
+    func_config.cudnn_conv_force_fwd_algo(0)
+    func_config.cudnn_conv_force_bwd_data_algo(1)
+    func_config.cudnn_conv_force_bwd_filter_algo(1)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def alexnet_train_job():
+        (labels, images) = _data_load_layer(args, args.train_dir)
+        loss = alexnet(args, images, labels)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        ).minimize(loss)
+        return flow.pack(loss, args.num_piece_in_batch)
+
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def alexnet_eval_job():
+        with flow.scope.consistent_view():
+            (labels, images) = _data_load_layer(args, args.eval_dir)
+            loss = alexnet(args, images, labels)
+            return flow.pack(loss, args.num_piece_in_batch)
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning alexnet: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = alexnet_train_job().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/alexnet"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    args.num_nodes = len(args.node_list.strip().split(",")) if args.multinode else 1
+    flow.env.ctrl_port(9788)
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/compatible/single_client/test/models/bert.py b/python/oneflow/compatible/single_client/test/models/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..696e6b81f41e1ee9b277eec6feb8ce0241167b62
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/bert.py
@@ -0,0 +1,399 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.common import data_type_pb2 as data_type_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+class BertBackbone(object):
+    def __init__(
+        self,
+        input_ids_blob,
+        input_mask_blob,
+        token_type_ids_blob,
+        vocab_size,
+        seq_length=512,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        initializer_range=0.02,
+    ):
+        with flow.scope.namespace("bert"):
+            with flow.scope.namespace("embeddings"):
+                (self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
+                    input_ids_blob=input_ids_blob,
+                    vocab_size=vocab_size,
+                    embedding_size=hidden_size,
+                    initializer_range=initializer_range,
+                    word_embedding_name="word_embeddings",
+                )
+                self.embedding_output_ = _EmbeddingPostprocessor(
+                    input_blob=self.embedding_output_,
+                    seq_length=seq_length,
+                    embedding_size=hidden_size,
+                    use_token_type=True,
+                    token_type_ids_blob=token_type_ids_blob,
+                    token_type_vocab_size=type_vocab_size,
+                    token_type_embedding_name="token_type_embeddings",
+                    use_position_embeddings=True,
+                    position_embedding_name="position_embeddings",
+                    initializer_range=initializer_range,
+                    max_position_embeddings=max_position_embeddings,
+                    dropout_prob=hidden_dropout_prob,
+                )
+            with flow.scope.namespace("encoder"):
+                addr_blob = _CreateAttentionMaskFromInputMask(
+                    input_mask_blob,
+                    from_seq_length=seq_length,
+                    to_seq_length=seq_length,
+                )
+                self.all_encoder_layers_ = _TransformerModel(
+                    input_blob=self.embedding_output_,
+                    addr_blob=addr_blob,
+                    seq_length=seq_length,
+                    hidden_size=hidden_size,
+                    num_hidden_layers=num_hidden_layers,
+                    num_attention_heads=num_attention_heads,
+                    intermediate_size=intermediate_size,
+                    intermediate_act_fn=GetActivation(hidden_act),
+                    hidden_dropout_prob=hidden_dropout_prob,
+                    attention_probs_dropout_prob=attention_probs_dropout_prob,
+                    initializer_range=initializer_range,
+                    do_return_all_layers=False,
+                )
+            self.sequence_output_ = self.all_encoder_layers_[-1]
+
+    def embedding_output(self):
+        return self.embedding_output_
+
+    def all_encoder_layers(self):
+        return self.all_encoder_layers_
+
+    def sequence_output(self):
+        return self.sequence_output_
+
+    def embedding_table(self):
+        return self.embedding_table_
+
+
+def CreateInitializer(std):
+    return flow.truncated_normal(std)
+
+
+def _Gelu(in_blob):
+    return flow.math.gelu(in_blob)
+
+
+def _TransformerModel(
+    input_blob,
+    addr_blob,
+    seq_length,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    intermediate_act_fn=_Gelu,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    initializer_range=0.02,
+    do_return_all_layers=False,
+):
+    assert hidden_size % num_attention_heads == 0
+    attention_head_size = int(hidden_size / num_attention_heads)
+    input_width = hidden_size
+    prev_output_blob = flow.reshape(input_blob, (-1, input_width))
+    all_layer_output_blobs = []
+    for layer_idx in range(num_hidden_layers):
+        with flow.scope.namespace("layer_%d" % layer_idx):
+            layer_input_blob = prev_output_blob
+            with flow.scope.namespace("attention"):
+                with flow.scope.namespace("self"):
+                    attention_output_blob = _AttentionLayer(
+                        from_blob=layer_input_blob,
+                        to_blob=layer_input_blob,
+                        addr_blob=addr_blob,
+                        num_attention_heads=num_attention_heads,
+                        size_per_head=attention_head_size,
+                        attention_probs_dropout_prob=attention_probs_dropout_prob,
+                        initializer_range=initializer_range,
+                        do_return_2d_tensor=True,
+                        from_seq_length=seq_length,
+                        to_seq_length=seq_length,
+                    )
+                with flow.scope.namespace("output"):
+                    attention_output_blob = _FullyConnected(
+                        attention_output_blob,
+                        input_size=num_attention_heads * attention_head_size,
+                        units=hidden_size,
+                        weight_initializer=CreateInitializer(initializer_range),
+                        name="dense",
+                    )
+                    attention_output_blob = _Dropout(
+                        attention_output_blob, hidden_dropout_prob
+                    )
+                    attention_output_blob = attention_output_blob + layer_input_blob
+                    attention_output_blob = _LayerNorm(
+                        attention_output_blob, hidden_size
+                    )
+            with flow.scope.namespace("intermediate"):
+                if callable(intermediate_act_fn):
+                    act_fn = op_conf_util.kNone
+                else:
+                    act_fn = intermediate_act_fn
+                intermediate_output_blob = _FullyConnected(
+                    attention_output_blob,
+                    input_size=num_attention_heads * attention_head_size,
+                    units=intermediate_size,
+                    activation=act_fn,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                if callable(intermediate_act_fn):
+                    intermediate_output_blob = intermediate_act_fn(
+                        intermediate_output_blob
+                    )
+            with flow.scope.namespace("output"):
+                layer_output_blob = _FullyConnected(
+                    intermediate_output_blob,
+                    input_size=intermediate_size,
+                    units=hidden_size,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
+                layer_output_blob = layer_output_blob + attention_output_blob
+                layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
+                prev_output_blob = layer_output_blob
+                all_layer_output_blobs.append(layer_output_blob)
+    input_shape = (-1, seq_length, hidden_size)
+    if do_return_all_layers:
+        final_output_blobs = []
+        for layer_output_blob in all_layer_output_blobs:
+            final_output_blob = flow.reshape(layer_output_blob, input_shape)
+            final_output_blobs.append(final_output_blob)
+        return final_output_blobs
+    else:
+        final_output_blob = flow.reshape(prev_output_blob, input_shape)
+        return [final_output_blob]
+
+
+def _AttentionLayer(
+    from_blob,
+    to_blob,
+    addr_blob,
+    num_attention_heads=1,
+    size_per_head=512,
+    query_act=op_conf_util.kNone,
+    key_act=op_conf_util.kNone,
+    value_act=op_conf_util.kNone,
+    attention_probs_dropout_prob=0.0,
+    initializer_range=0.02,
+    do_return_2d_tensor=False,
+    batch_size=None,
+    from_seq_length=None,
+    to_seq_length=None,
+):
+    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
+        output_blob = flow.reshape(
+            input_blob, [-1, seq_length, num_attention_heads, width]
+        )
+        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
+        return output_blob
+
+    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
+    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])
+    query_blob = _FullyConnected(
+        from_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=query_act,
+        name="query",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    key_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=key_act,
+        name="key",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    value_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=value_act,
+        name="value",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    query_blob = TransposeForScores(
+        query_blob, num_attention_heads, from_seq_length, size_per_head
+    )
+    key_blob = TransposeForScores(
+        key_blob, num_attention_heads, to_seq_length, size_per_head
+    )
+    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
+    attention_scores_blob = attention_scores_blob * (
+        1.0 / math.sqrt(float(size_per_head))
+    )
+    attention_scores_blob = attention_scores_blob + addr_blob
+    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
+    attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)
+    value_blob = flow.reshape(
+        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head]
+    )
+    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
+    context_blob = flow.matmul(attention_probs_blob, value_blob)
+    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])
+    if do_return_2d_tensor:
+        context_blob = flow.reshape(
+            context_blob, [-1, num_attention_heads * size_per_head]
+        )
+    else:
+        context_blob = flow.reshape(
+            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head]
+        )
+    return context_blob
+
+
+def _FullyConnected(
+    input_blob, input_size, units, activation=None, name=None, weight_initializer=None
+):
+    weight_blob = flow.get_variable(
+        name=name + "-weight",
+        shape=[input_size, units],
+        dtype=input_blob.dtype,
+        model_name="weight",
+        initializer=weight_initializer,
+    )
+    bias_blob = flow.get_variable(
+        name=name + "-bias",
+        shape=[units],
+        dtype=input_blob.dtype,
+        model_name="bias",
+        initializer=flow.constant_initializer(0.0),
+    )
+    output_blob = flow.matmul(input_blob, weight_blob)
+    output_blob = flow.nn.bias_add(output_blob, bias_blob)
+    return output_blob
+
+
+def _Dropout(input_blob, dropout_prob):
+    if dropout_prob == 0.0:
+        return input_blob
+    return flow.nn.dropout(input_blob, rate=dropout_prob)
+
+
+def _LayerNorm(input_blob, hidden_size):
+    return flow.layers.layer_norm(
+        input_blob, name="LayerNorm", begin_norm_axis=-1, begin_params_axis=-1
+    )
+
+
+def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
+    output = flow.cast(to_mask_blob, dtype=flow.float)
+    output = flow.reshape(output, [-1, 1, to_seq_length])
+    zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
+    attention_mask_blob = zeros + output
+    attention_mask_blob = flow.reshape(
+        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length]
+    )
+    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
+    addr_blob = (attention_mask_blob - 1.0) * 10000.0
+    return addr_blob
+
+
+def _EmbeddingPostprocessor(
+    input_blob,
+    seq_length,
+    embedding_size,
+    use_token_type=False,
+    token_type_ids_blob=None,
+    token_type_vocab_size=16,
+    token_type_embedding_name="token_type_embeddings",
+    use_position_embeddings=True,
+    position_embedding_name="position_embeddings",
+    initializer_range=0.02,
+    max_position_embeddings=512,
+    dropout_prob=0.1,
+):
+    output = input_blob
+    if use_token_type:
+        assert token_type_ids_blob is not None
+        token_type_table = flow.get_variable(
+            name=token_type_embedding_name,
+            shape=[token_type_vocab_size, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        token_type_embeddings = flow.gather(
+            params=token_type_table, indices=token_type_ids_blob, axis=0
+        )
+        output = output + token_type_embeddings
+    if use_position_embeddings:
+        position_table = flow.get_variable(
+            name=position_embedding_name,
+            shape=[1, max_position_embeddings, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        assert seq_length <= max_position_embeddings
+        if seq_length != max_position_embeddings:
+            position_table = flow.slice(
+                position_table, begin=[None, 0, 0], size=[None, seq_length, -1]
+            )
+        output = output + position_table
+    output = _LayerNorm(output, embedding_size)
+    output = _Dropout(output, dropout_prob)
+    return output
+
+
+def _EmbeddingLookup(
+    input_ids_blob,
+    vocab_size,
+    embedding_size=128,
+    initializer_range=0.02,
+    word_embedding_name="word_embeddings",
+):
+    embedding_table = flow.get_variable(
+        name=word_embedding_name,
+        shape=[vocab_size, embedding_size],
+        dtype=flow.float,
+        initializer=CreateInitializer(initializer_range),
+    )
+    output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
+    return (output, embedding_table)
+
+
+def GetActivation(name):
+    if name == "linear":
+        return None
+    elif name == "relu":
+        return flow.math.relu
+    elif name == "tanh":
+        return flow.math.tanh
+    elif name == "gelu":
+        return flow.math.gelu
+    else:
+        raise Exception("unsupported activation")
diff --git a/python/oneflow/compatible/single_client/test/models/cnns_tests.py b/python/oneflow/compatible/single_client/test/models/cnns_tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..12231eae0751e22ab703cf601cde947f13edebf6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/cnns_tests.py
@@ -0,0 +1,197 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import imp
+import os
+import sys
+
+import numpy
+from absl import app, flags
+
+from oneflow.compatible import single_client as flow
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("python_bin", "python3", "python binary program name or filepath.")
+flags.DEFINE_boolean(
+    "enable_auto_mixed_precision",
+    False,
+    "automatically change float net to mixed precision net",
+)
+
+
+class TestNetMixin:
+    """
+    Base Tester
+  """
+
+    def setUp(self):
+        self.net = ""
+        self.tf_loss_dir = ""
+        self.of_loss_dir = ""
+        self.num_iter = 10
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            self.num_iter = 3
+        self.set_params()
+        flow.clear_default_session()
+
+    def set_params(self):
+        pass
+
+    def assert_tolerance_4_mixed_precision(self):
+        raise AssertionError
+
+    def run_net(self, num_gpu_per_node, num_node=1, node_list=""):
+        net_modudle = _Import(self.net)
+        spec = net_modudle.DLNetSpec(FLAGS.enable_auto_mixed_precision)
+        spec.num_nodes = num_node
+        spec.gpu_num_per_node = num_gpu_per_node
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            spec.iter_num = 3
+        net_modudle.main(spec)
+        return
+        if num_node > 1:
+            os.system(
+                "{} {}.py -g {} -m -n {}".format(
+                    FLAGS.python_bin, self.net, num_gpu_per_node, node_list
+                )
+            )
+        else:
+            os.system(
+                "{} {}.py -g {}".format(FLAGS.python_bin, self.net, num_gpu_per_node)
+            )
+
+    def load_tf_loss(self):
+        tf_loss = numpy.load(os.path.join(self.tf_loss_dir, "1n1c.npy"))
+        return tf_loss[0 : self.num_iter]
+
+    def load_of_loss(self, test_type):
+        path = os.path.join(self.of_loss_dir, test_type + ".npy")
+        if os.path.exists(path):
+            of_loss = numpy.load(path)
+        else:
+            of_loss = numpy.zeros(self.num_iter)
+        return of_loss[0 : self.num_iter]
+
+    def print_and_check_result(self, result_name):
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            if self.net == "resnet50":
+                print("WARNING: skipping check for resnet50 cpu due to GEMM NaN")
+                return
+        loss_dict = {}
+        loss_dict["tensorflow"] = self.load_tf_loss()
+        loss_dict["oneflow"] = self.load_of_loss(result_name)
+        print("==".ljust(64, "="))
+        print(" ".ljust(2, " ") + self.net + " loss report")
+        print("==".ljust(64, "="))
+        fmt_str = "{:>6}  {:>12}  {:>12}"
+        print(fmt_str.format("iter", "tensorflow", "oneflow-" + result_name))
+        for i in range(self.num_iter):
+            fmt_str = "{:>6}  {:>12.6f}  {:>12.6f}"
+            print(
+                fmt_str.format(i, loss_dict["tensorflow"][i], loss_dict["oneflow"][i])
+            )
+        if FLAGS.enable_auto_mixed_precision:
+            tolerance = self.assert_tolerance_4_mixed_precision()
+            rtol = tolerance["rtol"]
+            atol = tolerance["atol"]
+            print(
+                "assert tolerance for mixed_precision are: rtol", rtol, ", atol", atol
+            )
+            self.assertTrue(
+                numpy.allclose(
+                    loss_dict["tensorflow"], loss_dict["oneflow"], rtol=rtol, atol=atol
+                )
+            )
+        else:
+            self.assertTrue(
+                numpy.allclose(loss_dict["tensorflow"], loss_dict["oneflow"])
+            )
+
+
+class TestAlexNetMixin(TestNetMixin):
+    """
+    AlexNet Tester
+  """
+
+    def set_params(self):
+        self.net = "alexnet"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 1e-05, "atol": 0.01}
+
+
+class TestResNet50Mixin(TestNetMixin):
+    """
+    AlexNet Tester
+  """
+
+    def set_params(self):
+        self.net = "resnet50"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 1e-08, "atol": 1e-05}
+
+
+class TestVgg16Mixin(TestNetMixin):
+    """
+    Vgg16 Tester
+  """
+
+    def set_params(self):
+        self.net = "vgg16"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 0.0001, "atol": 0.1}
+
+
+class TestInceptionV3Mixin(TestNetMixin):
+    """
+    InceptionV3 Tester
+  """
+
+    def set_params(self):
+        self.net = "inceptionv3"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 1e-05, "atol": 0.01}
+
+
+def _Import(name, globals=None, locals=None, fromlist=None):
+    try:
+        return sys.modules[name]
+    except KeyError:
+        pass
+    (fp, pathname, description) = imp.find_module(name)
+    try:
+        return imp.load_module(name, fp, pathname, description)
+    finally:
+        if fp:
+            fp.close()
diff --git a/python/oneflow/compatible/single_client/test/models/eager_1node_test.py b/python/oneflow/compatible/single_client/test/models/eager_1node_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..266e381a04272ab08f8928974fc00b8c51bb2794
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/eager_1node_test.py
@@ -0,0 +1,77 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import env_1node
+from absl import app
+from absl.testing import absltest
+from cnns_tests import (
+    TestAlexNetMixin,
+    TestInceptionV3Mixin,
+    TestResNet50Mixin,
+    TestVgg16Mixin,
+)
+from test_1node_mixin import Test1NodeMixin
+
+from oneflow.compatible import single_client as flow
+
+
+class TestAlexNet(Test1NodeMixin, TestAlexNetMixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestResNet50(Test1NodeMixin, TestResNet50Mixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestVgg16(Test1NodeMixin, TestVgg16Mixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestInceptionV3(Test1NodeMixin, TestInceptionV3Mixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestEagerMixin(object):
+    def setUp(self):
+        flow.clear_default_session()
+        flow.enable_eager_execution(True)
+
+
+flow.unittest.register_test_cases(
+    scope=globals(),
+    directory=os.path.dirname(os.path.realpath(__file__)),
+    filter_by_num_nodes=lambda x: x == 1,
+    base_class=absltest.TestCase,
+    test_case_mixin=TestEagerMixin,
+)
+
+
+def main(argv):
+    env_1node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/compatible/single_client/test/models/env_1node.py b/python/oneflow/compatible/single_client/test/models/env_1node.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0dfccda41a26c7b1cc5fb40ad1eeafe76c0aa62
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/env_1node.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+
+def Init():
+    flow.env.init()
diff --git a/python/oneflow/compatible/single_client/test/models/env_2node.py b/python/oneflow/compatible/single_client/test/models/env_2node.py
new file mode 100644
index 0000000000000000000000000000000000000000..133a292ef6ce35df577ae44e6d040455d8f4fabe
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/env_2node.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import atexit
+
+from absl import flags
+
+from oneflow.compatible import single_client as flow
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string(
+    "nodes_list", "192.168.1.15,192.168.1.16", "nodes list seperated by comma"
+)
+flags.DEFINE_integer("ctrl_port", "9524", "control port")
+
+
+def Init():
+    flow.env.machine(FLAGS.nodes_list.split(","))
+    flow.env.ctrl_port(FLAGS.ctrl_port)
+    flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    atexit.register(flow.deprecated.delete_worker)
diff --git a/python/oneflow/compatible/single_client/test/models/inceptionv3.py b/python/oneflow/compatible/single_client/test/models/inceptionv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d21d0bb9706d09272939bb3f79c43a1f15dd23e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/inceptionv3.py
@@ -0,0 +1,637 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG299/of_record_repeated"
+_EVAL_DIR = _DATA_DIR
+_TRAIN_DIR = _DATA_DIR
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/inceptionv3/models/of_model"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kSigmoid,
+    use_bias=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+    else:
+        kernel_size = tuple(kernel_size)
+    weight_shape = (filters, input.shape[1]) + kernel_size
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        elif activation == op_conf_util.kSigmoid:
+            output = flow.math.sigmoid(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=299, resize_y=299, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (normal, label)
+
+
+def InceptionA(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch5x5"):
+            branch5x5_1 = _conv2d_layer(
+                "conv0", in_blob, filters=48, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch5x5_2 = _conv2d_layer(
+                "conv1",
+                branch5x5_1,
+                filters=64,
+                kernel_size=5,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=32 if index == 0 else 64,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+        inceptionA_bn = []
+        inceptionA_bn.append(branch1x1)
+        inceptionA_bn.append(branch5x5_2)
+        inceptionA_bn.append(branch3x3dbl_3)
+        inceptionA_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionA_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionB(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=3, strides=2, padding="VALID"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool0",
+            )
+        inceptionB_bn = []
+        inceptionB_bn.append(branch3x3)
+        inceptionB_bn.append(branch3x3dbl_3)
+        inceptionB_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionB_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionC(in_blob, index, filters):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch7x7"):
+            branch7x7_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_2 = _conv2d_layer(
+                "conv1",
+                branch7x7_1,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_3 = _conv2d_layer(
+                "conv2",
+                branch7x7_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch7x7dbl"):
+            branch7x7dbl_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_2 = _conv2d_layer(
+                "conv1",
+                branch7x7dbl_1,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_3 = _conv2d_layer(
+                "conv2",
+                branch7x7dbl_2,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_4 = _conv2d_layer(
+                "conv3",
+                branch7x7dbl_3,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_5 = _conv2d_layer(
+                "conv4",
+                branch7x7dbl_4,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionC_bn = []
+        inceptionC_bn.append(branch1x1)
+        inceptionC_bn.append(branch7x7_3)
+        inceptionC_bn.append(branch7x7dbl_5)
+        inceptionC_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionC_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionD(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=320,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch7x7x3"):
+            branch7x7x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch7x7x3_2 = _conv2d_layer(
+                "conv1",
+                branch7x7x3_1,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_3 = _conv2d_layer(
+                "conv2",
+                branch7x7x3_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_4 = _conv2d_layer(
+                "conv3",
+                branch7x7x3_3,
+                filters=192,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool",
+            )
+        inceptionD_bn = []
+        inceptionD_bn.append(branch3x3_2)
+        inceptionD_bn.append(branch7x7x3_4)
+        inceptionD_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionD_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionE(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=320, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3_3 = _conv2d_layer(
+                "conv2",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+            inceptionE_1_bn = []
+            inceptionE_1_bn.append(branch3x3_2)
+            inceptionE_1_bn.append(branch3x3_3)
+            concat_branch3x3 = flow.concat(
+                values=inceptionE_1_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=448, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=384,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_4 = _conv2d_layer(
+                "conv3",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=1,
+                padding="SAME",
+            )
+            inceptionE_2_bn = []
+            inceptionE_2_bn.append(branch3x3dbl_3)
+            inceptionE_2_bn.append(branch3x3dbl_4)
+            concat_branch3x3dbl = flow.concat(
+                values=inceptionE_2_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionE_total_bn = []
+        inceptionE_total_bn.append(branch1x1)
+        inceptionE_total_bn.append(concat_branch3x3)
+        inceptionE_total_bn.append(concat_branch3x3dbl)
+        inceptionE_total_bn.append(branch_pool_2)
+        concat_total = flow.concat(values=inceptionE_total_bn, axis=1, name="concat")
+    return concat_total
+
+
+def InceptionV3(images, labels, trainable=True):
+    conv0 = _conv2d_layer(
+        "conv0", images, filters=32, kernel_size=3, strides=2, padding="VALID"
+    )
+    conv1 = _conv2d_layer(
+        "conv1", conv0, filters=32, kernel_size=3, strides=1, padding="VALID"
+    )
+    conv2 = _conv2d_layer(
+        "conv2", conv1, filters=64, kernel_size=3, strides=1, padding="SAME"
+    )
+    pool1 = flow.nn.max_pool2d(
+        conv2, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    conv3 = _conv2d_layer(
+        "conv3", pool1, filters=80, kernel_size=1, strides=1, padding="VALID"
+    )
+    conv4 = _conv2d_layer(
+        "conv4", conv3, filters=192, kernel_size=3, strides=1, padding="VALID"
+    )
+    pool2 = flow.nn.max_pool2d(
+        conv4, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool2"
+    )
+    mixed_0 = InceptionA(pool2, 0)
+    mixed_1 = InceptionA(mixed_0, 1)
+    mixed_2 = InceptionA(mixed_1, 2)
+    mixed_3 = InceptionB(mixed_2, 3)
+    mixed_4 = InceptionC(mixed_3, 4, 128)
+    mixed_5 = InceptionC(mixed_4, 5, 160)
+    mixed_6 = InceptionC(mixed_5, 6, 160)
+    mixed_7 = InceptionC(mixed_6, 7, 192)
+    mixed_8 = InceptionD(mixed_7, 8)
+    mixed_9 = InceptionE(mixed_8, 9)
+    mixed_10 = InceptionE(mixed_9, 10)
+    pool3 = flow.nn.avg_pool2d(
+        mixed_10, ksize=8, strides=1, padding="VALID", data_format="NCHW", name="pool3"
+    )
+    with flow.scope.namespace("logits"):
+        pool3 = flow.reshape(pool3, [pool3.shape[0], -1])
+        weight = flow.get_variable(
+            "fc1-weight",
+            shape=(pool3.shape[1], 1001),
+            dtype=flow.float,
+            initializer=flow.truncated_normal(0.816496580927726),
+            model_name="weight",
+        )
+        bias = flow.get_variable(
+            "fc1-bias",
+            shape=(1001,),
+            dtype=flow.float,
+            initializer=flow.constant_initializer(),
+            model_name="bias",
+        )
+        fc1 = flow.matmul(pool3, weight)
+        fc1 = flow.nn.bias_add(fc1, bias)
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=fc1, name="softmax_loss"
+    )
+    return loss
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float)
+    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def TrainNet():
+        (images, labels) = _data_load_layer(args, args.train_dir)
+        loss = InceptionV3(images, labels)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning inceptionv3: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = TrainNet().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/inceptionv3"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/compatible/single_client/test/models/pretrain.py b/python/oneflow/compatible/single_client/test/models/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c783ecb3a71191759341ac2013860c910cfd68
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/pretrain.py
@@ -0,0 +1,191 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import bert as bert_util
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def PreTrain(
+    input_ids_blob,
+    input_mask_blob,
+    token_type_ids_blob,
+    masked_lm_positions_blob,
+    masked_lm_ids_blob,
+    masked_lm_weights_blob,
+    next_sentence_label_blob,
+    vocab_size,
+    seq_length=512,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    max_position_embeddings=512,
+    type_vocab_size=16,
+    max_predictions_per_seq=20,
+    initializer_range=0.02,
+):
+    backbone = bert_util.BertBackbone(
+        input_ids_blob=input_ids_blob,
+        input_mask_blob=input_mask_blob,
+        token_type_ids_blob=token_type_ids_blob,
+        vocab_size=vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act=hidden_act,
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        initializer_range=initializer_range,
+    )
+    (lm_loss, _, _) = _AddMaskedLanguageModelLoss(
+        input_blob=backbone.sequence_output(),
+        output_weights_blob=backbone.embedding_table(),
+        positions_blob=masked_lm_positions_blob,
+        label_id_blob=masked_lm_ids_blob,
+        label_weight_blob=masked_lm_weights_blob,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        vocab_size=vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        hidden_act=bert_util.GetActivation(hidden_act),
+        initializer_range=initializer_range,
+    )
+    pooled_output = PooledOutput(
+        backbone.sequence_output(), hidden_size, initializer_range
+    )
+    (ns_loss, _, _) = _AddNextSentenceOutput(
+        input_blob=pooled_output,
+        label_blob=next_sentence_label_blob,
+        hidden_size=hidden_size,
+        initializer_range=initializer_range,
+    )
+    with flow.scope.namespace("cls-loss"):
+        total_loss = lm_loss + ns_loss
+    return total_loss
+
+
+def PooledOutput(sequence_output, hidden_size, initializer_range):
+    with flow.scope.namespace("bert-pooler"):
+        first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
+        first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
+        pooled_output = bert_util._FullyConnected(
+            first_token_tensor,
+            input_size=hidden_size,
+            units=hidden_size,
+            weight_initializer=bert_util.CreateInitializer(initializer_range),
+            name="dense",
+        )
+        pooled_output = flow.math.tanh(pooled_output)
+    return pooled_output
+
+
+def _AddMaskedLanguageModelLoss(
+    input_blob,
+    output_weights_blob,
+    positions_blob,
+    label_id_blob,
+    label_weight_blob,
+    seq_length,
+    hidden_size,
+    vocab_size,
+    max_predictions_per_seq,
+    hidden_act,
+    initializer_range,
+):
+    with flow.scope.namespace("other"):
+        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
+        ones = sum_label_weight_blob * 0.0 + 1.0
+        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
+        batch_size = flow.math.reduce_sum(ones)
+        sum_label_weight_blob = sum_label_weight_blob / batch_size
+    with flow.scope.namespace("cls-predictions"):
+        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
+        with flow.scope.namespace("transform"):
+            if callable(hidden_act):
+                act_fn = op_conf_util.kNone
+            else:
+                act_fn = hidden_act
+            input_blob = bert_util._FullyConnected(
+                input_blob,
+                input_size=hidden_size,
+                units=hidden_size,
+                activation=act_fn,
+                weight_initializer=bert_util.CreateInitializer(initializer_range),
+                name="dense",
+            )
+            if callable(hidden_act):
+                input_blob = hidden_act(input_blob)
+                input_blob = bert_util._LayerNorm(input_blob, hidden_size)
+        output_bias = flow.get_variable(
+            name="output_bias",
+            shape=[vocab_size],
+            dtype=input_blob.dtype,
+            initializer=flow.constant_initializer(1.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
+        label_id_blob = flow.reshape(label_id_blob, [-1])
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_id_blob
+        )
+        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
+        numerator = pre_example_loss * label_weight_blob
+        with flow.scope.namespace("loss"):
+            numerator = flow.math.reduce_sum(numerator, axis=[-1])
+            denominator = sum_label_weight_blob + 1e-05
+            loss = numerator / denominator
+        return (loss, pre_example_loss, logit_blob)
+
+
+def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
+    output = flow.gather(
+        params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
+    )
+    output = flow.reshape(output, [-1, hidden_size])
+    return output
+
+
+def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
+    with flow.scope.namespace("cls-seq_relationship"):
+        output_weight_blob = flow.get_variable(
+            name="output_weights",
+            shape=[2, hidden_size],
+            dtype=input_blob.dtype,
+            model_name="weight",
+            initializer=bert_util.CreateInitializer(initializer_range),
+        )
+        output_bias_blob = flow.get_variable(
+            name="output_bias",
+            shape=[2],
+            dtype=input_blob.dtype,
+            model_name="bias",
+            initializer=flow.constant_initializer(0.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_blob
+        )
+        loss = pre_example_loss
+        return (loss, pre_example_loss, logit_blob)
diff --git a/python/oneflow/compatible/single_client/test/models/resnet50.py b/python/oneflow/compatible/single_client/test/models/resnet50.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ccbadcb58c1c417b0d0febbfcfb24eddf54dde2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/resnet50.py
@@ -0,0 +1,332 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+DATA_DIR = "/dataset/PNGS/PNG228/of_record_repeated"
+MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/resnet50/models/of_model"
+MODEL_SAVE = "./output/model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+NODE_LIST = "192.168.1.12,192.168.1.14"
+IMAGE_SIZE = 228
+BLOCK_COUNTS = [3, 4, 6, 3]
+BLOCK_FILTERS = [256, 512, 1024, 2048]
+BLOCK_FILTERS_INNER = [64, 128, 256, 512]
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = DATA_DIR
+        self.train_dir = DATA_DIR
+        self.model_save_dir = MODEL_SAVE
+        self.model_load_dir = MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=MODEL_SAVE, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+g_output_key = []
+g_trainable = True
+
+
+def _data_load(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(
+        image, resize_x=IMAGE_SIZE, resize_y=IMAGE_SIZE, color_space="RGB"
+    )
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def _conv2d(
+    name,
+    input,
+    filters,
+    kernel_size,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilations=1,
+    weight_initializer=flow.variance_scaling_initializer(),
+):
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=(filters, input.shape[1], kernel_size, kernel_size),
+        dtype=input.dtype,
+        initializer=weight_initializer,
+        trainable=g_trainable,
+    )
+    return flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilations, name=name
+    )
+
+
+def _batch_norm(inputs, name=None):
+    return flow.layers.batch_normalization(
+        inputs=inputs,
+        axis=1,
+        momentum=0.997,
+        epsilon=1e-05,
+        center=True,
+        scale=True,
+        trainable=g_trainable,
+        name=name,
+    )
+
+
+def conv2d_affine(
+    input, name, filters, kernel_size, strides, activation=op_conf_util.kNone
+):
+    padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID"
+    output = _conv2d(name, input, filters, kernel_size, strides, padding)
+    return output
+
+
+def bottleneck_transformation(input, block_name, filters, filters_inner, strides):
+    a = conv2d_affine(
+        input,
+        block_name + "_branch2a",
+        filters_inner,
+        1,
+        1,
+        activation=op_conf_util.kRelu,
+    )
+    b = conv2d_affine(
+        a,
+        block_name + "_branch2b",
+        filters_inner,
+        1,
+        strides,
+        activation=op_conf_util.kRelu,
+    )
+    c = conv2d_affine(b, block_name + "_branch2c", filters, 1, 1)
+    return c
+
+
+def residual_block(input, block_name, filters, filters_inner, strides_init):
+    if strides_init != 1 or block_name == "res2_0":
+        shortcut = conv2d_affine(
+            input, block_name + "_branch1", filters, 1, strides_init
+        )
+    else:
+        shortcut = input
+    bottleneck = bottleneck_transformation(
+        input, block_name, filters, filters_inner, strides_init
+    )
+    return flow.math.relu(shortcut + bottleneck)
+
+
+def residual_stage(input, stage_name, counts, filters, filters_inner, stride_init=2):
+    output = input
+    for i in range(counts):
+        block_name = "%s_%d" % (stage_name, i)
+        output = residual_block(
+            output, block_name, filters, filters_inner, stride_init if i == 0 else 1
+        )
+    return output
+
+
+def resnet_conv_x_body(input, on_stage_end=lambda x: x):
+    output = input
+    for (i, (counts, filters, filters_inner)) in enumerate(
+        zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER)
+    ):
+        stage_name = "res%d" % (i + 2)
+        output = residual_stage(
+            output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2
+        )
+        on_stage_end(output)
+        g_output_key.append(stage_name)
+    return output
+
+
+def resnet_stem(input):
+    conv1 = _conv2d("conv1", input, 64, 7, 2)
+    g_output_key.append("conv1")
+    conv1_bn = conv1
+    pool1 = flow.nn.avg_pool2d(
+        conv1_bn, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    g_output_key.append("pool1")
+    return pool1
+
+
+def resnet50(args, data_dir):
+    (labels, images) = _data_load(args, data_dir)
+    g_output_key.append("input_img")
+    with flow.scope.namespace("Resnet"):
+        stem = resnet_stem(images)
+        body = resnet_conv_x_body(stem, lambda x: x)
+        pool5 = flow.nn.avg_pool2d(
+            body, ksize=7, strides=1, padding="VALID", data_format="NCHW", name="pool5"
+        )
+        g_output_key.append("pool5")
+        fc1001 = flow.layers.dense(
+            flow.reshape(pool5, (pool5.shape[0], -1)),
+            units=1001,
+            use_bias=True,
+            kernel_initializer=flow.xavier_uniform_initializer(),
+            bias_initializer=flow.zeros_initializer(),
+            trainable=g_trainable,
+            name="fc1001",
+        )
+        g_output_key.append("fc1001")
+        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            labels, fc1001, name="softmax_loss"
+        )
+        g_output_key.append("cross_entropy")
+    return loss
+
+
+def _set_trainable(trainable):
+    global g_trainable
+    g_trainable = trainable
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    train_config = flow.FunctionConfig()
+    train_config.default_logical_view(flow.scope.consistent_view())
+    train_config.default_data_type(flow.float)
+    train_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=train_config)
+    def TrainNet():
+        _set_trainable(True)
+        loss = resnet50(args, args.train_dir)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.0032]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    eval_config = flow.FunctionConfig()
+    eval_config.default_data_type(flow.float)
+    eval_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(function_config=eval_config)
+    def evaluate():
+        with flow.scope.consistent_view():
+            _set_trainable(False)
+            return resnet50(args, args.eval_dir)
+
+    check_point = flow.train.CheckPoint()
+    check_point.load(MODEL_LOAD)
+    loss = []
+    fmt_str = "{:>12}  {:>12}  {:.6f}"
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    for i in range(args.iter_num):
+        train_loss = TrainNet().get().mean()
+        loss.append(train_loss)
+        print(fmt_str.format(i, "train loss:", train_loss))
+    loss_file = "{}n{}c.npy".format(
+        str(args.num_nodes), str(args.gpu_num_per_node * args.num_nodes)
+    )
+    loss_path = "./of_loss/resnet50"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    flow.env.log_dir("./output/log")
+    flow.env.ctrl_port(12138)
+    args = parser.parse_args()
+    if args.multinode:
+        flow.env.ctrl_port(12139)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.scp_binary_without_uuid:
+            flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+        elif args.skip_scp_binary:
+            flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+        else:
+            flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    num_nodes = len(args.node_list.strip().split(",")) if args.multinode else 1
+    print(
+        "Traning resnet50: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/compatible/single_client/test/models/run_cnns_test.py b/python/oneflow/compatible/single_client/test/models/run_cnns_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1add9a5fb9100d09724b0402e94f39d66c48b488
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/run_cnns_test.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import cnns_tests
+import env_2node
+import numpy
+from absl import app
+from absl.testing import absltest
+from test_1node_mixin import Test1NodeMixin
+from test_2node_mixin import Test2NodeMixin
+
+
+class TestNodeMixin(Test1NodeMixin, Test2NodeMixin):
+    pass
+
+
+class TestAlexNet(TestNodeMixin, cnns_tests.TestAlexNetMixin, absltest.TestCase):
+    pass
+
+
+class TestResNet50(TestNodeMixin, cnns_tests.TestResNet50Mixin, absltest.TestCase):
+    pass
+
+
+class TestVgg16(TestNodeMixin, cnns_tests.TestVgg16Mixin, absltest.TestCase):
+    pass
+
+
+class TestInceptionV3(
+    TestNodeMixin, cnns_tests.TestInceptionV3Mixin, absltest.TestCase
+):
+    pass
+
+
+def main(argv):
+    env_2node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/compatible/single_client/test/models/test_1node_mixin.py b/python/oneflow/compatible/single_client/test/models/test_1node_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..4da0f6c201f04aeec04996f99f1b178b29172242
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/test_1node_mixin.py
@@ -0,0 +1,27 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow.compatible.single_client.unittest
+
+
+class Test1NodeMixin:
+    def test_1n1c(self):
+        self.run_net(1)
+        self.print_and_check_result("1n1c")
+
+    def test_1n4c(self):
+        self.run_net(4)
+        self.print_and_check_result("1n4c")
diff --git a/python/oneflow/compatible/single_client/test/models/test_2node_mixin.py b/python/oneflow/compatible/single_client/test/models/test_2node_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..baee191fa7dda2469592a104651c81b69b8f43c2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/test_2node_mixin.py
@@ -0,0 +1,27 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from absl import flags
+
+import oneflow.compatible.single_client.unittest
+
+FLAGS = flags.FLAGS
+
+
+class Test2NodeMixin:
+    def test_2n8c(self):
+        self.run_net(4, 2, FLAGS.nodes_list)
+        self.print_and_check_result("2n8c")
diff --git a/python/oneflow/compatible/single_client/test/models/test_alexnet_model.py b/python/oneflow/compatible/single_client/test/models/test_alexnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac1098e03ed227f5cdf73a9482c20455a7da8291
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/test_alexnet_model.py
@@ -0,0 +1,255 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from datetime import datetime
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG227/of_record_repeated"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/alexnet/models/of_model_bk"
+
+
+class DLNetSpec(object):
+    def __init__(self):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.node_list = None
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+
+
+global_specs = DLNetSpec()
+
+
+class TrainData(flow.model.DataModule):
+    def __init__(self, specs):
+        super().__init__()
+        self.specs = specs
+
+    def forward(self, *args):
+        return _data_load_layer(self.specs, self.specs.train_dir)
+
+
+class ValData(flow.model.DataModule):
+    def __init__(self, specs):
+        super().__init__()
+        self.specs = specs
+
+    def forward(self, *args):
+        return _data_load_layer(self.specs, self.specs.eval_dir)
+
+
+class AlexNet(flow.model.Model):
+    def __init__(self, specs, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.specs = specs
+
+    def forward(self, images, trainable=False):
+        conv1 = _conv2d_layer(
+            "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+        )
+        pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+        conv2 = _conv2d_layer("conv2", pool1, filters=192, kernel_size=5)
+        pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+        conv3 = _conv2d_layer("conv3", pool2, filters=384)
+        conv4 = _conv2d_layer("conv4", conv3, filters=384)
+        conv5 = _conv2d_layer("conv5", conv4, filters=256)
+        pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+
+        def _get_initializer():
+            kernel_initializer = initializer_conf_util.InitializerConf()
+            kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+            return kernel_initializer
+
+        if len(pool5.shape) > 2:
+            pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+        fc1 = flow.layers.dense(
+            inputs=pool5,
+            units=4096,
+            activation=flow.math.relu,
+            use_bias=False,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=False,
+            trainable=trainable,
+            name="fc1",
+        )
+        dropout1 = fc1
+        fc2 = flow.layers.dense(
+            inputs=dropout1,
+            units=4096,
+            activation=flow.math.relu,
+            use_bias=False,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=False,
+            trainable=trainable,
+            name="fc2",
+        )
+        dropout2 = fc2
+        fc3 = flow.layers.dense(
+            inputs=dropout2,
+            units=1001,
+            activation=None,
+            use_bias=False,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=False,
+            trainable=trainable,
+            name="fc3",
+        )
+        return fc3
+
+    def training_step(self, batch, optimizer_idx):
+        assert optimizer_idx == 0
+        (images, labels) = batch
+        fc3 = self(images, True)
+        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            labels, fc3, name="softmax_loss"
+        )
+        return loss
+
+    def validation_step(self, batch):
+        (images, labels) = batch
+        fc3 = self(images, False)
+        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            labels, fc3, name="softmax_loss"
+        )
+        return loss
+
+    def configure_optimizers(self):
+        return flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        )
+
+
+class LossMoniter(flow.model.Callback):
+    def on_training_step_end(self, step_idx, outputs, optimizer_idx):
+        assert optimizer_idx == 0
+        loss = outputs.mean()
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(step_idx, "train loss:", loss))
+
+    def on_validation_step_end(self, step_idx, outputs):
+        loss = outputs.mean()
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(step_idx, "validation loss:", loss))
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    flow.env.ctrl_port(9788)
+    flow.config.machine_num(global_specs.num_nodes)
+    flow.config.gpu_device_num(global_specs.gpu_num_per_node)
+    train_exe_config = flow.ExecutionConfig()
+    train_exe_config.default_logical_view(flow.scope.consistent_view())
+    train_exe_config.default_data_type(flow.float)
+    train_config = flow.model.TrainingConfig()
+    train_config.config_execution(train_exe_config)
+    train_config.config_data(TrainData(global_specs))
+    val_exe_config = flow.ExecutionConfig()
+    val_exe_config.default_logical_view(flow.scope.consistent_view())
+    val_exe_config.default_data_type(flow.float)
+    val_config = flow.model.ValidationConfig()
+    val_config.config_execution(val_exe_config)
+    val_config.config_data(ValData(global_specs))
+    val_config.config_step_interval(10)
+    ck_config = flow.model.CheckpointConfig()
+    ck_config.config_load(dirpath=global_specs.model_load_dir)
+    ck_config.config_save(dirpath=global_specs.model_save_dir, step_interval=10)
+    loss_monitor_cb = LossMoniter()
+    alexnet_md = AlexNet(global_specs, is_deprecated_function_style=True)
+    alexnet_md.fit(
+        training_config=train_config,
+        validation_config=val_config,
+        checkpoint_config=ck_config,
+        callbacks=[loss_monitor_cb],
+        max_steps=20,
+    )
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.random_uniform_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.nn.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    (image, label) = flow.data.ofrecord_image_classification_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        image_feature_name="encoded",
+        label_feature_name="class/label",
+        color_space="RGB",
+        name="decode",
+    )
+    rsz = flow.image.resize(image, target_size=[227, 227], color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (normal, label)
diff --git a/python/oneflow/compatible/single_client/test/models/test_bert.py b/python/oneflow/compatible/single_client/test/models/test_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..7da44d3530b0c360be12bf40775c6b7b1dd3ec1b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/test_bert.py
@@ -0,0 +1,277 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import copy
+import os
+import sys
+import unittest
+
+import numpy as np
+from absl import flags
+from pretrain import PreTrain
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/dataset/bert/bert_seq_len_128_repeat1024", "")
+flags.DEFINE_string(
+    "model_load_dir", "/dataset/bert_regression_test/of_random_init_L-12_H-768_A-12", ""
+)
+flags.DEFINE_string("model_save_dir", "snapshots", "")
+flags.DEFINE_float("lr", 0.0001, "learning rate")
+flags.DEFINE_float("weight_decay_rate", 0.01, "")
+flags.DEFINE_integer("batch_size", 24, "")
+flags.DEFINE_integer("data_part_num", 8, "")
+flags.DEFINE_integer("seq_length", 128, "")
+flags.DEFINE_integer("max_predictions_per_seq", 20, "")
+flags.DEFINE_integer("num_hidden_layers", 12, "")
+flags.DEFINE_integer("num_attention_heads", 12, "")
+flags.DEFINE_integer("max_position_embeddings", 512, "")
+flags.DEFINE_integer("type_vocab_size", 2, "")
+flags.DEFINE_integer("vocab_size", 30522, "")
+flags.DEFINE_float("attention_probs_dropout_prob", 0.0, "")
+flags.DEFINE_float("hidden_dropout_prob", 0.0, "")
+flags.DEFINE_integer("hidden_size_per_head", 64, "")
+FLAGS(sys.argv)
+
+
+def _blob_conf(name, shape, dtype=flow.int32):
+    return flow.data.BlobConf(
+        name=name, shape=shape, dtype=dtype, codec=flow.data.RawCodec()
+    )
+
+
+def BertDecoder(
+    data_dir, batch_size=1, data_part_num=1, seq_length=128, max_predictions_per_seq=20
+):
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=batch_size, data_part_num=data_part_num, name="decode"
+    )
+    input_ids = flow.data.ofrecord_raw_decoder(
+        ofrecord, "input_ids", shape=(seq_length,), dtype=flow.int32
+    )
+    next_sentence_labels = flow.data.ofrecord_raw_decoder(
+        ofrecord, "next_sentence_labels", shape=(1,), dtype=flow.int32
+    )
+    input_mask = flow.data.ofrecord_raw_decoder(
+        ofrecord, "input_mask", shape=(seq_length,), dtype=flow.int32
+    )
+    segment_ids = flow.data.ofrecord_raw_decoder(
+        ofrecord, "segment_ids", shape=(seq_length,), dtype=flow.int32
+    )
+    masked_lm_ids = flow.data.ofrecord_raw_decoder(
+        ofrecord, "masked_lm_ids", shape=(max_predictions_per_seq,), dtype=flow.int32
+    )
+    masked_lm_positions = flow.data.ofrecord_raw_decoder(
+        ofrecord,
+        "masked_lm_positions",
+        shape=(max_predictions_per_seq,),
+        dtype=flow.int32,
+    )
+    masked_lm_weights = flow.data.ofrecord_raw_decoder(
+        ofrecord,
+        "masked_lm_weights",
+        shape=(max_predictions_per_seq,),
+        dtype=flow.float,
+    )
+    return (
+        input_ids,
+        next_sentence_labels,
+        input_mask,
+        segment_ids,
+        masked_lm_ids,
+        masked_lm_positions,
+        masked_lm_weights,
+    )
+
+
+def BuildPreTrainNet(
+    batch_size,
+    data_part_num,
+    seq_length=128,
+    max_position_embeddings=512,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    vocab_size=30522,
+    type_vocab_size=2,
+    max_predictions_per_seq=20,
+):
+    hidden_size = 64 * num_attention_heads
+    intermediate_size = hidden_size * 4
+    if data_part_num == 1:
+        with flow.scope.placement("cpu", "0:0"):
+            decoders = BertDecoder(
+                FLAGS.data_dir,
+                batch_size,
+                data_part_num,
+                seq_length,
+                max_predictions_per_seq,
+            )
+    else:
+        assert data_part_num > 1
+        decoders = BertDecoder(
+            FLAGS.data_dir,
+            batch_size,
+            data_part_num,
+            seq_length,
+            max_predictions_per_seq,
+        )
+    input_ids = decoders[0]
+    next_sentence_labels = decoders[1]
+    input_mask = decoders[2]
+    token_type_ids = decoders[3]
+    masked_lm_ids = decoders[4]
+    masked_lm_positions = decoders[5]
+    masked_lm_weights = decoders[6]
+    return PreTrain(
+        input_ids,
+        input_mask,
+        token_type_ids,
+        masked_lm_positions,
+        masked_lm_ids,
+        masked_lm_weights,
+        next_sentence_labels,
+        vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act="gelu",
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        initializer_range=0.02,
+    )
+
+
+def CreateOptimizer():
+    lr_warmup = flow.optimizer.warmup.linear(1000, 0)
+    lr_scheduler = flow.optimizer.PolynomialScheduler(
+        FLAGS.lr, 100000, 0.0, warmup=lr_warmup
+    )
+    return flow.optimizer.AdamW(
+        lr_scheduler,
+        epsilon=1e-06,
+        weight_decay=FLAGS.weight_decay_rate,
+        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
+        grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0),
+    )
+
+
+def PretrainJob():
+    total_loss = BuildPreTrainNet(
+        batch_size=FLAGS.batch_size,
+        data_part_num=FLAGS.data_part_num,
+        seq_length=FLAGS.seq_length,
+        max_position_embeddings=FLAGS.max_position_embeddings,
+        num_hidden_layers=FLAGS.num_hidden_layers,
+        num_attention_heads=FLAGS.num_attention_heads,
+        hidden_dropout_prob=FLAGS.hidden_dropout_prob,
+        attention_probs_dropout_prob=FLAGS.attention_probs_dropout_prob,
+        vocab_size=FLAGS.vocab_size,
+        type_vocab_size=FLAGS.type_vocab_size,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+    )
+    opt = CreateOptimizer()
+    opt.minimize(total_loss)
+    return total_loss
+
+
+func_config = flow.FunctionConfig()
+func_config.default_logical_view(flow.scope.consistent_view())
+func_config.enable_auto_mixed_precision(FLAGS.enable_auto_mixed_precision)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    flow.config.enable_debug_mode(True)
+    flow.config.gpu_device_num(1)
+    pretrain_job = flow.global_function(type="train", function_config=func_config)(
+        PretrainJob
+    )
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    of_loss = [pretrain_job().get().mean() for _ in range(10)]
+    print(of_loss)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n4c(test_case):
+    flow.config.gpu_device_num(4)
+    pretrain_job = flow.global_function(type="train", function_config=func_config)(
+        PretrainJob
+    )
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    of_loss = [pretrain_job().get().mean() for _ in range(10)]
+    print(of_loss)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+@flow.unittest.num_nodes_required(2)
+def test_2n8c(test_case):
+    flow.config.gpu_device_num(4)
+    pretrain_job = flow.global_function(type="train", function_config=func_config)(
+        PretrainJob
+    )
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    of_loss = [pretrain_job().get().mean() for _ in range(10)]
+    print(of_loss)
+
+
+def test_inplace(test_case):
+    test_case.assertTrue(
+        np.allclose(GetSeveralLossesAsNumpy(True), GetSeveralLossesAsNumpy(False))
+    )
+
+
+def GetSeveralLossesAsNumpy(enable_inplace, num_iters=10):
+    flow.config.enable_debug_mode(True)
+    flow.config.gpu_device_num(1)
+    train_config = flow.FunctionConfig()
+    train_config.default_logical_view(flow.scope.consistent_view())
+    train_config.enable_inplace(enable_inplace)
+
+    @flow.global_function(type="train", function_config=train_config)
+    def PretrainJob():
+        loss = BuildPreTrainNet(
+            batch_size=FLAGS.batch_size,
+            data_part_num=FLAGS.data_part_num,
+            seq_length=FLAGS.seq_length,
+            max_position_embeddings=FLAGS.max_position_embeddings,
+            num_hidden_layers=1,
+            num_attention_heads=FLAGS.num_attention_heads,
+            hidden_dropout_prob=FLAGS.hidden_dropout_prob,
+            attention_probs_dropout_prob=FLAGS.attention_probs_dropout_prob,
+            vocab_size=FLAGS.vocab_size,
+            type_vocab_size=FLAGS.type_vocab_size,
+            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        )
+        CreateOptimizer().minimize(loss)
+        return loss
+
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    ret = [PretrainJob().get().mean() for _ in range(num_iters)]
+    flow.clear_default_session()
+    return np.array(ret)
diff --git a/python/oneflow/compatible/single_client/test/models/test_dcgan.py b/python/oneflow/compatible/single_client/test/models/test_dcgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..f160d1a33ff84fc8df8c7360b7bde81c654318cb
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/test_dcgan.py
@@ -0,0 +1,326 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    dcgan = DCGAN()
+    dcgan.compare_with_tf(1)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n4c(test_case):
+    dcgan = DCGAN()
+    dcgan.compare_with_tf(4)
+
+
+class DCGAN:
+    def __init__(self):
+        self.lr = 0.0001
+        self.z_dim = 100
+        self.batch_size = 32
+
+    def compare_with_tf(self, gpu_num, result_dir="/dataset/gan_test/dcgan/"):
+        flow.config.gpu_device_num(gpu_num)
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(type="train", function_config=func_config)
+        def test_generator(
+            z: oft.Numpy.Placeholder((self.batch_size, self.z_dim)),
+            label1: oft.Numpy.Placeholder((self.batch_size, 1)),
+        ):
+            g_out = self.generator(z, trainable=True, const_init=True)
+            g_logits = self.discriminator(g_out, trainable=False, const_init=True)
+            g_loss = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(g_logits),
+                g_logits,
+                name="Gloss_sigmoid_cross_entropy_with_logits",
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+            ).minimize(g_loss)
+            return g_loss
+
+        @flow.global_function(type="train", function_config=func_config)
+        def test_discriminator(
+            z: oft.Numpy.Placeholder((self.batch_size, 100)),
+            images: oft.Numpy.Placeholder((self.batch_size, 1, 28, 28)),
+            label1: oft.Numpy.Placeholder((self.batch_size, 1)),
+            label0: oft.Numpy.Placeholder((self.batch_size, 1)),
+        ):
+            g_out = self.generator(z, trainable=False, const_init=True)
+            g_logits = self.discriminator(g_out, trainable=True, const_init=True)
+            d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.zeros_like(g_logits),
+                g_logits,
+                name="Dloss_fake_sigmoid_cross_entropy_with_logits",
+            )
+            d_logits = self.discriminator(
+                images, trainable=True, reuse=True, const_init=True
+            )
+            d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(d_logits),
+                d_logits,
+                name="Dloss_real_sigmoid_cross_entropy_with_logits",
+            )
+            d_loss = d_loss_fake + d_loss_real
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+            ).minimize(d_loss)
+            return d_loss
+
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        z = np.load(os.path.join(result_dir, "z.npy"))
+        imgs = np.load(os.path.join(result_dir, "img.npy")).transpose(0, 3, 1, 2)
+        label1 = np.ones((self.batch_size, 1)).astype(np.float32)
+        label0 = np.zeros((self.batch_size, 1)).astype(np.float32)
+        g_loss = test_generator(z, label1).get()
+        d_loss = test_discriminator(z, imgs, label1, label0).get()
+        tf_g_loss = np.load(os.path.join(result_dir, "g_loss.npy"))
+        tf_d_loss = np.load(os.path.join(result_dir, "d_loss.npy"))
+        if gpu_num == 1:
+            assert np.allclose(
+                g_loss.numpy(), tf_g_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(g_loss.ndarray().mean(), tf_g_loss.mean())
+            assert np.allclose(
+                d_loss.numpy(), tf_d_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(d_loss.ndarray().mean(), tf_d_loss.mean())
+
+    def generator(self, z, const_init=False, trainable=True):
+        h0 = layers.dense(
+            z, 7 * 7 * 256, name="g_fc1", const_init=const_init, trainable=trainable
+        )
+        h0 = layers.batchnorm(h0, axis=1, name="g_bn1")
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h0 = flow.reshape(h0, (-1, 256, 7, 7))
+        h1 = layers.deconv2d(
+            h0,
+            128,
+            5,
+            strides=1,
+            name="g_deconv1",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h1 = layers.batchnorm(h1, name="g_bn2")
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        h2 = layers.deconv2d(
+            h1,
+            64,
+            5,
+            strides=2,
+            name="g_deconv2",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h2 = layers.batchnorm(h2, name="g_bn3")
+        h2 = flow.nn.leaky_relu(h2, 0.3)
+        out = layers.deconv2d(
+            h2,
+            1,
+            5,
+            strides=2,
+            name="g_deconv3",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        out = flow.math.tanh(out)
+        return out
+
+    def discriminator(self, img, const_init=False, trainable=True, reuse=False):
+        h0 = layers.conv2d(
+            img,
+            64,
+            5,
+            name="d_conv1",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h1 = layers.conv2d(
+            h0,
+            128,
+            5,
+            name="d_conv2",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        out = flow.reshape(h1, (self.batch_size, -1))
+        out = layers.dense(
+            out, 1, name="d_fc", const_init=const_init, trainable=trainable, reuse=reuse
+        )
+        return out
+
+
+class layers:
+    @staticmethod
+    def deconv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=False,
+    ):
+        name_ = name if reuse == False else name + "_reuse"
+        weight_shape = (input.shape[1], filters, size, size)
+        output_shape = (
+            input.shape[0],
+            filters,
+            input.shape[2] * strides,
+            input.shape[3] * strides,
+        )
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.conv2d_transpose(
+            input,
+            weight,
+            strides=[strides, strides],
+            output_shape=output_shape,
+            padding="SAME",
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def conv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        padding="same",
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=True,
+    ):
+        name_ = name if reuse == False else name + "_reuse"
+        weight_shape = (filters, input.shape[1], size, size)
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.compat_conv2d(
+            input,
+            weight,
+            strides=[strides, strides],
+            padding=padding,
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def dense(
+        input,
+        units,
+        name,
+        use_bias=False,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+    ):
+        name_ = name if reuse == False else name + "_reuse"
+        in_shape = input.shape
+        in_num_axes = len(in_shape)
+        assert in_num_axes >= 2
+        inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input
+        weight = flow.get_variable(
+            name="{}-weight".format(name),
+            shape=(units, inputs.shape[1]),
+            dtype=inputs.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            model_name="weight",
+            reuse=reuse,
+        )
+        out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul")
+        if use_bias:
+            bias = flow.get_variable(
+                name="{}-bias".format(name),
+                shape=(units,),
+                dtype=inputs.dtype,
+                initializer=flow.random_normal_initializer()
+                if not const_init
+                else flow.constant_initializer(0.002),
+                trainable=trainable,
+                model_name="bias",
+                reuse=reuse,
+            )
+            out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")
+        out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
+        return out
+
+    @staticmethod
+    def batchnorm(input, name, axis=1, reuse=False):
+        name_ = name if reuse == False else name + "_reuse"
+        return flow.layers.batch_normalization(input, axis=axis, name=name_)
diff --git a/python/oneflow/compatible/single_client/test/models/test_dcgan_model.py b/python/oneflow/compatible/single_client/test/models/test_dcgan_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f9b947c84c4bde435cb4b7b3e9178e0624372a9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/test_dcgan_model.py
@@ -0,0 +1,358 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+class DCGAN(flow.model.Model):
+    def __init__(self, gpu_num, batch_size, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.gpu_num = gpu_num
+        self.lr = 0.0001
+        self.z_dim = 100
+        self.batch_size = batch_size
+
+    def _generator(self, z, const_init=False, trainable=True):
+        h0 = Layers.dense(
+            z, 7 * 7 * 256, name="g_fc1", const_init=const_init, trainable=trainable
+        )
+        h0 = Layers.batchnorm(h0, axis=1, name="g_bn1")
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h0 = flow.reshape(h0, (-1, 256, 7, 7))
+        h1 = Layers.deconv2d(
+            h0,
+            128,
+            5,
+            strides=1,
+            name="g_deconv1",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h1 = Layers.batchnorm(h1, name="g_bn2")
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        h2 = Layers.deconv2d(
+            h1,
+            64,
+            5,
+            strides=2,
+            name="g_deconv2",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h2 = Layers.batchnorm(h2, name="g_bn3")
+        h2 = flow.nn.leaky_relu(h2, 0.3)
+        out = Layers.deconv2d(
+            h2,
+            1,
+            5,
+            strides=2,
+            name="g_deconv3",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        out = flow.math.tanh(out)
+        return out
+
+    def _discriminator(self, img, const_init=False, trainable=True, reuse=False):
+        h0 = Layers.conv2d(
+            img,
+            64,
+            5,
+            name="d_conv1",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h1 = Layers.conv2d(
+            h0,
+            128,
+            5,
+            name="d_conv2",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        out = flow.reshape(h1, (self.batch_size, -1))
+        out = Layers.dense(
+            out, 1, name="d_fc", const_init=const_init, trainable=trainable, reuse=reuse
+        )
+        return out
+
+    def forward(self, batch, const_init=False, trainable=False):
+        return self._generator(batch, const_init=const_init, trainable=trainable)
+
+    def training_step(self, batch, optimizer_idx):
+        if optimizer_idx == 0:
+            (z,) = batch
+            g_out = self._generator(z, trainable=True, const_init=True)
+            g_logits = self._discriminator(g_out, trainable=False, const_init=True)
+            g_loss = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(g_logits),
+                g_logits,
+                name="Gloss_sigmoid_cross_entropy_with_logits",
+            )
+            return (g_loss, g_out)
+        elif optimizer_idx == 1:
+            (z, images) = batch
+            g_out = self._generator(z, trainable=False, const_init=True)
+            g_logits = self._discriminator(g_out, trainable=True, const_init=True)
+            d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.zeros_like(g_logits),
+                g_logits,
+                name="Dloss_fake_sigmoid_cross_entropy_with_logits",
+            )
+            d_logits = self._discriminator(
+                images, trainable=True, reuse=True, const_init=True
+            )
+            d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(d_logits),
+                d_logits,
+                name="Dloss_real_sigmoid_cross_entropy_with_logits",
+            )
+            d_loss = d_loss_fake + d_loss_real
+            return d_loss
+
+    def configure_optimizers(self):
+        generator_opt = flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+        )
+        discriminator_opt = flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+        )
+        return [generator_opt, discriminator_opt]
+
+
+class LossMoniter(flow.model.Callback):
+    def __init__(self, result_dir):
+        self.result_dir = result_dir
+
+    def on_training_step_end(self, step_idx, outputs, optimizer_idx):
+        if optimizer_idx == 0:
+            (g_loss, g_out) = outputs
+            fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+            print(fmt_str.format(step_idx, "train g_loss:", g_loss.numpy().mean()))
+            print(fmt_str.format(step_idx, "train g_out:", g_out.numpy().mean()))
+            tf_g_loss = np.load(os.path.join(self.result_dir, "g_loss.npy"))
+            assert np.allclose(
+                g_loss.numpy(), tf_g_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(g_loss.numpy().mean(), tf_g_loss.mean())
+        elif optimizer_idx == 1:
+            d_loss = outputs
+            fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+            print(fmt_str.format(step_idx, "train d_loss:", d_loss.numpy().mean()))
+            tf_d_loss = np.load(os.path.join(self.result_dir, "d_loss.npy"))
+            assert np.allclose(
+                d_loss.numpy(), tf_d_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(d_loss.numpy().mean(), tf_d_loss.mean())
+
+
+class NumpyTrainData(flow.model.NumpyDataModule):
+    def __init__(self, result_dir, batch_size):
+        super().__init__()
+        self.z = np.load(os.path.join(result_dir, "z.npy"))
+        self.images = np.load(os.path.join(result_dir, "img.npy")).transpose(0, 3, 1, 2)
+
+    def forward(self, step_idx, optimizer_idx):
+        if optimizer_idx == 0:
+            return (self.z,)
+        else:
+            return (self.z, self.images)
+
+
+class NumpyValData(flow.model.NumpyDataModule):
+    def __init__(self, result_dir, batch_size):
+        super().__init__()
+        self.z = np.load(os.path.join(result_dir, "z.npy"))
+
+    def forward(self, step_idx):
+        return (self.z,)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    dcgan_compare = DCGANCompare()
+    dcgan_compare.compare_with_tf(1)
+
+
+class DCGANCompare:
+    def compare_with_tf(self, gpu_num, result_dir="/dataset/gan_test/dcgan/"):
+        batch_size = 32
+        flow.config.gpu_device_num(gpu_num)
+        train_exe_config = flow.ExecutionConfig()
+        train_exe_config.default_data_type(flow.float)
+        train_exe_config.default_logical_view(flow.scope.consistent_view())
+        train_config = flow.model.TrainingConfig()
+        train_config.config_execution(train_exe_config)
+        train_config.config_data(NumpyTrainData(result_dir, batch_size))
+        loss_monitor_cb = LossMoniter(result_dir)
+        dcgan_md = DCGAN(gpu_num, batch_size, is_deprecated_function_style=True)
+        dcgan_md.fit(
+            training_config=train_config, callbacks=[loss_monitor_cb], max_steps=3
+        )
+
+
+class Layers:
+    @staticmethod
+    def deconv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=False,
+    ):
+        name_ = name if not reuse else name + "_reuse"
+        weight_shape = (input.shape[1], filters, size, size)
+        output_shape = (
+            input.shape[0],
+            filters,
+            input.shape[2] * strides,
+            input.shape[3] * strides,
+        )
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.conv2d_transpose(
+            input,
+            weight,
+            strides=[strides, strides],
+            output_shape=output_shape,
+            padding="SAME",
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def conv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        padding="same",
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=True,
+    ):
+        name_ = name if not reuse else name + "_reuse"
+        weight_shape = (filters, input.shape[1], size, size)
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.compat_conv2d(
+            input,
+            weight,
+            strides=[strides, strides],
+            padding=padding,
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def dense(
+        input,
+        units,
+        name,
+        use_bias=False,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+    ):
+        name_ = name if not reuse else name + "_reuse"
+        in_shape = input.shape
+        in_num_axes = len(in_shape)
+        assert in_num_axes >= 2
+        inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input
+        weight = flow.get_variable(
+            name="{}-weight".format(name),
+            shape=(units, inputs.shape[1]),
+            dtype=inputs.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            model_name="weight",
+            reuse=reuse,
+        )
+        out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul")
+        if use_bias:
+            bias = flow.get_variable(
+                name="{}-bias".format(name),
+                shape=(units,),
+                dtype=inputs.dtype,
+                initializer=flow.random_normal_initializer()
+                if not const_init
+                else flow.constant_initializer(0.002),
+                trainable=trainable,
+                model_name="bias",
+                reuse=reuse,
+            )
+            out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")
+        out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
+        return out
+
+    @staticmethod
+    def batchnorm(input, name, axis=1, reuse=False):
+        name_ = name if not reuse else name + "_reuse"
+        return flow.layers.batch_normalization(input, axis=axis, name=name_)
diff --git a/python/oneflow/compatible/single_client/test/models/test_dqn.py b/python/oneflow/compatible/single_client/test/models/test_dqn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4a32a1a24cc6f76637012061feb49e320030ee9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/test_dqn.py
@@ -0,0 +1,250 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    dqn = DQN("gpu")
+    dqn.test_parameters_copy()
+
+
+def getQNetParams(var_name_prefix: str = "QNet", is_train: bool = True):
+    weight_init = flow.variance_scaling_initializer(
+        scale=1.0, mode="fan_in", distribution="truncated_normal", data_format="NCHW"
+    )
+    bias_init = flow.constant_initializer(value=0.0)
+    conv_prefix = "_conv1"
+    conv1_weight = flow.get_variable(
+        var_name_prefix + conv_prefix + "_weight",
+        shape=(32, 4, 3, 3),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    conv1_bias = flow.get_variable(
+        var_name_prefix + conv_prefix + "_bias",
+        shape=(32,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    conv_prefix = "_conv2"
+    conv2_weight = flow.get_variable(
+        var_name_prefix + conv_prefix + "_weight",
+        shape=(32, 32, 3, 3),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    conv2_bias = flow.get_variable(
+        var_name_prefix + conv_prefix + "_bias",
+        shape=(32,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    fc_prefix = "_fc1"
+    fc1_weight = flow.get_variable(
+        var_name_prefix + fc_prefix + "_weight",
+        shape=(512, 32 * 16 * 16),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    fc1_bias = flow.get_variable(
+        var_name_prefix + fc_prefix + "_bias",
+        shape=(512,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    fc_prefix = "_fc2"
+    fc2_weight = flow.get_variable(
+        var_name_prefix + fc_prefix + "_weight",
+        shape=(2, 512),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    fc2_bias = flow.get_variable(
+        var_name_prefix + fc_prefix + "_bias",
+        shape=(2,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    return (
+        conv1_weight,
+        conv1_bias,
+        conv2_weight,
+        conv2_bias,
+        fc1_weight,
+        fc1_bias,
+        fc2_weight,
+        fc2_bias,
+    )
+
+
+BATCH_SIZE = 32
+
+
+def createOfQNet(
+    input_image: oft.Numpy.Placeholder((BATCH_SIZE, 4, 64, 64), dtype=flow.float32),
+    var_name_prefix: str = "QNet",
+    is_train: bool = True,
+) -> oft.Numpy:
+    (
+        conv1_weight,
+        conv1_bias,
+        conv2_weight,
+        conv2_bias,
+        fc1_weight,
+        fc1_bias,
+        fc2_weight,
+        fc2_bias,
+    ) = getQNetParams(var_name_prefix=var_name_prefix, is_train=is_train)
+    (
+        conv1_weight,
+        conv1_bias,
+        conv2_weight,
+        conv2_bias,
+        fc1_weight,
+        fc1_bias,
+        fc2_weight,
+        fc2_bias,
+    ) = getQNetParams(var_name_prefix=var_name_prefix, is_train=is_train)
+    conv1 = flow.nn.compat_conv2d(
+        input_image, conv1_weight, strides=[1, 1], padding="same", data_format="NCHW"
+    )
+    conv1 = flow.nn.bias_add(conv1, conv1_bias, "NCHW")
+    conv1 = flow.nn.relu(conv1)
+    pool1 = flow.nn.max_pool2d(conv1, 2, 2, "VALID", "NCHW", name="pool1")
+    conv2 = flow.nn.compat_conv2d(
+        pool1, conv2_weight, strides=[1, 1], padding="same", data_format="NCHW"
+    )
+    conv2 = flow.nn.bias_add(conv2, conv2_bias, "NCHW")
+    conv2 = flow.nn.relu(conv2)
+    pool2 = flow.nn.max_pool2d(conv2, 2, 2, "VALID", "NCHW", name="pool2")
+    pool2_flatten = flow.reshape(pool2, (BATCH_SIZE, -1))
+    fc1 = flow.matmul(a=pool2_flatten, b=fc1_weight, transpose_b=True)
+    fc1 = flow.nn.bias_add(fc1, fc1_bias)
+    fc1 = flow.nn.relu(fc1)
+    fc2 = flow.matmul(a=fc1, b=fc2_weight, transpose_b=True)
+    fc2 = flow.nn.bias_add(fc2, fc2_bias)
+    return fc2
+
+
+def get_train_config():
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    return func_config
+
+
+def get_predict_config():
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    return func_config
+
+
+class DQN:
+    def __init__(self, device_tag):
+        self.device_tag_ = device_tag
+
+    def test_parameters_copy(self):
+        @flow.global_function("train", get_train_config())
+        def trainQNet(
+            input_image: oft.Numpy.Placeholder(
+                (BATCH_SIZE, 4, 64, 64), dtype=flow.float32
+            ),
+            y_input: oft.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.float32),
+            action_input: oft.Numpy.Placeholder((BATCH_SIZE, 2), dtype=flow.float32),
+        ) -> oft.Numpy:
+            with flow.scope.placement(self.device_tag_, "0:0-0"):
+                out = createOfQNet(input_image, var_name_prefix="QNet", is_train=True)
+                Q_Action = flow.math.reduce_sum(out * action_input, axis=1)
+                cost = flow.math.reduce_mean(flow.math.square(y_input - Q_Action))
+                learning_rate = 0.0002
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                    momentum=0,
+                ).minimize(cost)
+            return out
+
+        @flow.global_function("predict", get_predict_config())
+        def predictQNet(
+            input_image: oft.Numpy.Placeholder(
+                (BATCH_SIZE, 4, 64, 64), dtype=flow.float32
+            )
+        ) -> oft.Numpy:
+            with flow.scope.placement(self.device_tag_, "0:0-0"):
+                out = createOfQNet(input_image, var_name_prefix="QNetT", is_train=False)
+                return out
+
+        @flow.global_function("predict", get_predict_config())
+        def copyQNetToQnetT():
+            with flow.scope.placement(self.device_tag_, "0:0-0"):
+                (
+                    t_conv1_weight,
+                    t_conv1_bias,
+                    t_conv2_weight,
+                    t_conv2_bias,
+                    t_fc1_weight,
+                    t_fc1_bias,
+                    t_fc2_weight,
+                    t_fc2_bias,
+                ) = getQNetParams(var_name_prefix="QNet", is_train=True)
+                (
+                    p_conv1_weight,
+                    p_conv1_bias,
+                    p_conv2_weight,
+                    p_conv2_bias,
+                    p_fc1_weight,
+                    p_fc1_bias,
+                    p_fc2_weight,
+                    p_fc2_bias,
+                ) = getQNetParams(var_name_prefix="QNetT", is_train=False)
+                flow.assign(p_conv1_weight, t_conv1_weight)
+                flow.assign(p_conv1_bias, t_conv1_bias)
+                flow.assign(p_conv2_weight, t_conv2_weight)
+                flow.assign(p_conv2_bias, t_conv2_bias)
+                flow.assign(p_fc1_weight, t_fc1_weight)
+                flow.assign(p_fc1_bias, t_fc1_bias)
+                flow.assign(p_fc2_weight, t_fc2_weight)
+                flow.assign(p_fc2_bias, t_fc2_bias)
+
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        input_image = np.ones((BATCH_SIZE, 4, 64, 64)).astype(np.float32)
+        y_input = np.random.random_sample((BATCH_SIZE,)).astype(np.float32)
+        action_input = np.random.random_sample((BATCH_SIZE, 2)).astype(np.float32)
+        train_out = trainQNet(input_image, y_input, action_input)
+        copyQNetToQnetT()
+        train_out = trainQNet(input_image, y_input, action_input)
+        predict_out = predictQNet(input_image)
+        assert np.allclose(train_out, predict_out, rtol=0.01, atol=0.1), "{}-{}".format(
+            train_out.mean(), predict_out.mean()
+        )
diff --git a/python/oneflow/compatible/single_client/test/models/vgg16.py b/python/oneflow/compatible/single_client/test/models/vgg16.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7a48f91865d8d61b014651bbced3d3db88e8d1b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/models/vgg16.py
@@ -0,0 +1,303 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+from oneflow.compatible import single_client as flow
+from oneflow.core.job import initializer_conf_pb2 as initializer_conf_util
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG224/of_record_repeated"
+_SINGLE_DATA_DIR = "/dataset/PNGS/PNG224/of_record"
+_MODEL_LOAD_DIR = "/dataset/PNGS/cnns_model_for_test/vgg16/models/of_model"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD_DIR
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD_DIR, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="VALID",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, "NCHW")
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=224, resize_y=224, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def _conv_block(in_blob, index, filters, conv_times):
+    conv_block = []
+    conv_block.insert(0, in_blob)
+    for i in range(conv_times):
+        conv_i = _conv2d_layer(
+            name="conv{}".format(index),
+            input=conv_block[i],
+            filters=filters,
+            kernel_size=3,
+            strides=1,
+        )
+        conv_block.append(conv_i)
+        index += 1
+    return conv_block
+
+
+def vgg(images, labels, trainable=True):
+    to_return = []
+    conv1 = _conv_block(images, 0, 64, 2)
+    pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv_block(pool1, 2, 128, 2)
+    pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv_block(pool2, 4, 256, 3)
+    pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", "NCHW", name="pool3")
+    conv4 = _conv_block(pool3, 7, 512, 3)
+    pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", "NCHW", name="pool4")
+    conv5 = _conv_block(pool4, 10, 512, 3)
+    pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_kernel_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    def _get_bias_initializer():
+        bias_initializer = initializer_conf_util.InitializerConf()
+        bias_initializer.constant_conf.value = 0.0
+        return bias_initializer
+
+    pool5 = flow.reshape(pool5, [-1, 512])
+    fc6 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc1",
+    )
+    fc7 = flow.layers.dense(
+        inputs=fc6,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc2",
+    )
+    fc8 = flow.layers.dense(
+        inputs=fc7,
+        units=1001,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc_final",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc8, name="softmax_loss"
+    )
+    to_return.append(loss)
+    return tuple(to_return)
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    train_config = flow.FunctionConfig()
+    train_config.default_logical_view(flow.scope.consistent_view())
+    train_config.default_data_type(flow.float)
+    train_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=train_config)
+    def vgg_train_job():
+        (labels, images) = _data_load_layer(args, args.train_dir)
+        to_return = vgg(images, labels)
+        loss = to_return[-1]
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    eval_config = flow.FunctionConfig()
+    eval_config.default_logical_view(flow.scope.consistent_view())
+    eval_config.default_data_type(flow.float)
+    eval_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(function_config=eval_config)
+    def vgg_eval_job():
+        (labels, images) = _data_load_layer(args, args.eval_dir)
+        return vgg(images, labels, False)
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning vgg16: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = vgg_train_job().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/vgg16"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    flow.env.log_dir("./log")
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/compatible/single_client/test/ops/image_test_util.py b/python/oneflow/compatible/single_client/test/ops/image_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..6492956c8b64db9562fd9aacedff61c4e9d82b60
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/image_test_util.py
@@ -0,0 +1,159 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import random
+
+import cv2
+import numpy as np
+import PIL
+
+from oneflow.compatible import single_client as flow
+
+global_coco_dict = dict()
+default_coco_anno_file = "/dataset/mscoco_2017/annotations/instances_val2017.json"
+default_coco_image_dir = "/dataset/mscoco_2017/val2017"
+
+
+def get_coco(anno_file):
+    global global_coco_dict
+    if anno_file not in global_coco_dict:
+        from pycocotools.coco import COCO
+
+        global_coco_dict[anno_file] = COCO(anno_file)
+    return global_coco_dict[anno_file]
+
+
+def random_sample_images_from_coco(
+    anno_file=default_coco_anno_file, image_dir=default_coco_image_dir, batch_size=2
+):
+    image_files = []
+    image_ids = []
+    batch_group_id = -1
+    coco = get_coco(anno_file)
+    img_ids = coco.getImgIds()
+    while len(image_files) < batch_size:
+        rand_img_id = random.choice(img_ids)
+        img_h = coco.imgs[rand_img_id]["height"]
+        img_w = coco.imgs[rand_img_id]["width"]
+        group_id = int(img_h / img_w)
+        if batch_group_id == -1:
+            batch_group_id = group_id
+        if group_id != batch_group_id:
+            continue
+        image_files.append(os.path.join(image_dir, coco.imgs[rand_img_id]["file_name"]))
+        image_ids.append(rand_img_id)
+    assert len(image_files) == len(image_ids)
+    return (image_files, image_ids)
+
+
+def read_images_by_cv(image_files, dtype, channels=3):
+    np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+    images = [cv2.imread(image_file).astype(np_dtype) for image_file in image_files]
+    assert all((isinstance(image, np.ndarray) for image in images))
+    assert all((image.ndim == 3 for image in images))
+    assert all((image.shape[2] == channels for image in images))
+    return images
+
+
+def read_images_by_pil(image_files, dtype, channels=3):
+    image_objs = [PIL.Image.open(image_file) for image_file in image_files]
+    images = []
+    np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+    for im in image_objs:
+        bands = im.getbands()
+        band = "".join(bands)
+        if band == "RGB":
+            images.append(np.asarray(im).astype(np_dtype)[:, :, ::-1])
+        elif band == "L":
+            gs_image = np.asarray(im).astype(np_dtype)
+            gs_image_shape = gs_image.shape
+            assert len(gs_image_shape) == 2
+            gs_image = gs_image.reshape(gs_image_shape + (1,))
+            gs_image = np.broadcast_to(gs_image, shape=gs_image_shape + (3,))
+            images.append(gs_image)
+        elif band == "BGR":
+            images.append(np.asarray(im).astype(np_dtype))
+        else:
+            raise NotImplementedError
+    assert all((isinstance(image, np.ndarray) for image in images))
+    assert all((image.ndim == 3 for image in images))
+    assert all((image.shape[2] == channels for image in images))
+    return images
+
+
+def infer_images_static_shape(images, channels=3):
+    image_shapes = [image.shape for image in images]
+    assert all((image.ndim == 3 for image in images))
+    assert all((image.shape[2] == channels for image in images))
+    image_shapes = np.asarray(image_shapes)
+    max_h = np.max(image_shapes[:, 0]).item()
+    max_w = np.max(image_shapes[:, 1]).item()
+    image_static_shape = (len(images), max_h, max_w, channels)
+    group_ids = []
+    aspect_ratio_list = []
+    for image_shape in image_shapes:
+        (h, w) = image_shape[0:2]
+        if h < w:
+            group_id = 0
+            aspect_ratio = h / w
+        else:
+            group_id = 1
+            aspect_ratio = w / h
+        group_ids.append(group_id)
+        aspect_ratio_list.append(aspect_ratio)
+    assert all((group_id == group_ids[0] for group_id in group_ids))
+    return (image_static_shape, aspect_ratio_list)
+
+
+def compute_keep_aspect_ratio_resized_size(
+    target_size, min_size, max_size, aspect_ratio, resize_side
+):
+    if resize_side == "shorter":
+        min_res_size = target_size
+        max_res_size = int(round(min_res_size / aspect_ratio))
+        if max_size is not None and max_res_size > max_size:
+            max_res_size = max_size
+            min_res_size = int(round(max_res_size * aspect_ratio))
+    elif resize_side == "longer":
+        max_res_size = target_size
+        min_res_size = int(round(max_res_size * aspect_ratio))
+        if min_size is not None and min_res_size < min_size:
+            min_res_size = min_size
+            max_res_size = int(round(min_res_size / aspect_ratio))
+    else:
+        raise NotImplementedError
+    return (min_res_size, max_res_size)
+
+
+def infer_keep_aspect_ratio_resized_images_static_shape(
+    target_size,
+    min_size,
+    max_size,
+    aspect_ratio_list,
+    resize_side="shorter",
+    channels=3,
+):
+    resized_size_list = []
+    for aspect_ratio in aspect_ratio_list:
+        resized_size_list.append(
+            compute_keep_aspect_ratio_resized_size(
+                target_size, min_size, max_size, aspect_ratio, resize_side
+            )
+        )
+    (res_min_size, res_max_size) = max(
+        resized_size_list, key=lambda size: size[0] * size[1]
+    )
+    return (res_min_size, res_max_size, channels)
diff --git a/python/oneflow/compatible/single_client/test/ops/test_2d_gpu_variable.py b/python/oneflow/compatible/single_client/test/ops/test_2d_gpu_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..52024d6a45ce28038fe54522fdc55908e0157180
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_2d_gpu_variable.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@flow.unittest.skip_unless_1n2d()
+class Test2dGpuVariable(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_2d_gpu_variable(test_case):
+        flow.enable_eager_execution()
+        flow.config.gpu_device_num(2)
+        device_name = "0:0-1"
+
+        @flow.global_function(type="train", function_config=flow.FunctionConfig())
+        def Foo():
+            with flow.scope.placement("gpu", device_name):
+                w = flow.get_variable(
+                    "w",
+                    shape=(10,),
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                )
+                print(w.numpy(0))
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.1]), momentum=0
+            ).minimize(w)
+
+        Foo()
+        Foo()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_KLDivloss.py b/python/oneflow/compatible/single_client/test/ops/test_KLDivloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..7528b36d64c764f64da8ef7bd2142e52aef1638b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_KLDivloss.py
@@ -0,0 +1,193 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_kldivloss_with_np(
+    input_shape, target_shape, log_target, device_type, machine_ids, device_counts
+):
+    input = np.random.random(size=input_shape).astype(np.float32)
+    target = np.random.random(size=target_shape).astype(np.float32)
+    log_target = log_target[0]
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def np_kldivloss(np_input, np_target, np_log_target):
+        if log_target:
+            np_kl_div_loss = np.exp(np_target) * (np_target - np_input)
+        else:
+            np_kl_div_out_loss = target * (np.log(target) - np_input)
+            np_zeros = np.zeros_like(np_kl_div_out_loss, dtype=np.float32)
+            np_kl_div_loss = np.where(target > 0, np_kl_div_out_loss, np_zeros)
+        return {
+            "np_kldivloss": np_kl_div_loss,
+            "np_kldivloss_mean": np.mean(np_kl_div_loss),
+            "np_kldivloss_sum": np.sum(np_kl_div_loss),
+        }
+
+    np_out_kldivloss_dict = np_kldivloss(input, target, log_target)
+
+    def np_kldivloss_diff(input, target, np_log_target):
+        elem_cnt = input.size
+        if np_log_target:
+            _np_diff = -np.exp(target)
+        else:
+            _np_diff = -target
+            _zero_index = np.where(target > 0, 1, 0)
+            _np_diff = _np_diff * _zero_index
+        return {
+            "np_kldivloss_grad": _np_diff,
+            "np_kldivloss_grad_mean": _np_diff / elem_cnt,
+        }
+
+    np_grad_dict = np_kldivloss_diff(input, target, log_target)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad_dict["np_kldivloss_grad_mean"], atol=0.0001)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_kldivloss(
+        of_input: tp.Numpy.Placeholder(shape=input.shape),
+        of_target: tp.Numpy.Placeholder(shape=target.shape),
+    ) -> Dict[str, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            of_input = of_input + v
+        flow.watch_diff(of_input, assert_prediction_grad)
+        of_kldivloss = flow.nn.KLDivLoss(
+            of_input,
+            of_target,
+            log_target=log_target,
+            reduction="none",
+            name="kldivloss",
+        )
+        of_kldivloss_mean = flow.nn.KLDivLoss(
+            of_input,
+            of_target,
+            log_target=log_target,
+            reduction="mean",
+            name="kldivloss_mean",
+        )
+        of_kldivloss_sum = flow.nn.KLDivLoss(
+            of_input,
+            of_target,
+            log_target=log_target,
+            reduction="sum",
+            name="kldivloss_sum",
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_kldivloss_mean)
+        return {
+            "of_kldivloss": of_kldivloss,
+            "of_kldivloss_mean": of_kldivloss_mean,
+            "of_kldivloss_sum": of_kldivloss_sum,
+        }
+
+    of_out_kldivloss_dict = oneflow_kldivloss(input, target)
+    assert np.allclose(
+        of_out_kldivloss_dict["of_kldivloss"],
+        np_out_kldivloss_dict["np_kldivloss"],
+        atol=1e-05,
+    )
+    assert np.allclose(
+        of_out_kldivloss_dict["of_kldivloss_mean"],
+        np_out_kldivloss_dict["np_kldivloss_mean"],
+    )
+    assert np.allclose(
+        of_out_kldivloss_dict["of_kldivloss_sum"],
+        np_out_kldivloss_dict["np_kldivloss_sum"],
+    )
+
+
+def _gen_arg_dict(shape, log_target, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["target_shape"] = [shape]
+    arg_dict["log_target"] = [log_target]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_KLDivLoss_1n1d(flow.unittest.TestCase):
+    def test_kldivloss_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3),
+            log_target=[True],
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_kldivloss_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_kldivloss_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 4),
+            log_target=[False],
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_kldivloss_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Test_KLDivLoss_1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_kldivloss_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 4),
+            log_target=[True],
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_kldivloss_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_MarginRankingLoss.py b/python/oneflow/compatible/single_client/test/ops/test_MarginRankingLoss.py
new file mode 100644
index 0000000000000000000000000000000000000000..861e31c6e0289e959825e6d5659532eb5fa83d0a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_MarginRankingLoss.py
@@ -0,0 +1,201 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_margin_ranking_loss_with_np(
+    input1_shape,
+    input2_shape,
+    target_shape,
+    margin,
+    device_type,
+    machine_ids,
+    device_counts,
+):
+    input1 = np.random.random(size=input1_shape).astype(np.float32)
+    input2 = np.random.random(size=input2_shape).astype(np.float32)
+    target = np.random.random(size=target_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def np_margin_ranking_loss(np_input1, np_input2, np_target, np_margin):
+        np_target = np.broadcast_to(np_target, shape=np_input1.shape)
+        np_margin_loss = np.maximum(0, -(np_input1 - np_input2) * np_target + np_margin)
+        np_margin_loss_mean = np.mean(np_margin_loss)
+        np_margin_loss_sum = np.sum(np_margin_loss)
+        return {
+            "np_margin_ranking_loss": np_margin_loss,
+            "np_margin_ranking_loss_mean": np_margin_loss_mean,
+            "np_margin_ranking_loss_sum": np_margin_loss_sum,
+        }
+
+    np_out_marginloss_dict = np_margin_ranking_loss(input1, input2, target, margin)
+
+    def np_margin_ranking_diff(np_out, np_target):
+        _elem_cnt = np_out.size
+        if np_out.shape != np_target.shape:
+            np_target = np.broadcast_to(np_target, shape=np_out.shape)
+        _clip_zero_index = np.where(np_out > 0, 1, 0)
+        _np_grad = -np_target
+        return {"np_margin_ranking_grad_mean": _np_grad * _clip_zero_index / _elem_cnt}
+
+    np_grad_dict = np_margin_ranking_diff(
+        np_out_marginloss_dict["np_margin_ranking_loss"], target
+    )
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad_dict["np_margin_ranking_grad_mean"])
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_marginloss(
+        of_input1: tp.Numpy.Placeholder(shape=input1.shape),
+        of_input2: tp.Numpy.Placeholder(shape=input2.shape),
+        of_target: tp.Numpy.Placeholder(shape=target.shape),
+    ) -> Dict[str, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input1.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+                name="x_var",
+            )
+            x_var = of_input1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        marginloss = flow.nn.MarginRankingLoss(
+            of_input1,
+            of_input2,
+            of_target,
+            margin=margin,
+            reduction="none",
+            name="of_marginloss",
+        )
+        marginloss_mean = flow.nn.MarginRankingLoss(
+            x_var,
+            of_input2,
+            of_target,
+            margin=margin,
+            reduction="mean",
+            name="of_marginloss_reduce_mean",
+        )
+        marginloss_sum = flow.nn.MarginRankingLoss(
+            of_input1,
+            of_input2,
+            of_target,
+            margin=margin,
+            reduction="sum",
+            name="of_marginloss_reduce_sum",
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(marginloss_mean)
+        return {
+            "of_margin_ranking_loss": marginloss,
+            "of_margin_ranking_loss_mean": marginloss_mean,
+            "of_margin_ranking_loss_sum": marginloss_sum,
+        }
+
+    of_out_marginloss_dict = oneflow_marginloss(input1, input2, target)
+    assert np.allclose(
+        of_out_marginloss_dict["of_margin_ranking_loss"],
+        np_out_marginloss_dict["np_margin_ranking_loss"],
+    )
+    assert np.allclose(
+        of_out_marginloss_dict["of_margin_ranking_loss_mean"],
+        np_out_marginloss_dict["np_margin_ranking_loss_mean"],
+    )
+    assert np.allclose(
+        of_out_marginloss_dict["of_margin_ranking_loss_sum"],
+        np_out_marginloss_dict["np_margin_ranking_loss_sum"],
+    )
+
+
+def _gen_arg_dict(shape, target_shape, margin, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input1_shape"] = [shape]
+    arg_dict["input2_shape"] = [shape]
+    arg_dict["target_shape"] = [target_shape]
+    arg_dict["margin"] = [margin]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testmarginloss1n1d(flow.unittest.TestCase):
+    def test_margin_ranking_loss_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 5),
+            target_shape=(3, 1),
+            margin=0.3,
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_margin_ranking_loss_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_margin_ranking_loss_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 5),
+            target_shape=(4, 1),
+            margin=0.3,
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_margin_ranking_loss_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testmarginloss1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_margin_ranking_loss_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3),
+            target_shape=(3, 1),
+            margin=0.3,
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_margin_ranking_loss_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestDataTypeAttr.py b/python/oneflow/compatible/single_client/test/ops/test_TestDataTypeAttr.py
new file mode 100644
index 0000000000000000000000000000000000000000..b37a72292f0cfd2763713b7a2deba3d9be4f6d1a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestDataTypeAttr.py
@@ -0,0 +1,66 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def TestDataTypeAttr(input, output_type):
+    assert output_type in flow.dtypes()
+    return (
+        flow.user_op_builder("TestDataTypeAttr")
+        .Op("TestDataTypeAttr")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("output_type", output_type)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def RunTest(data_type):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def TestDataTypeAttrJob(input: oft.Numpy.Placeholder((10, 10), dtype=flow.float)):
+        return TestDataTypeAttr(input, type_name_to_flow_type[data_type])
+
+    input = np.random.random_sample((10, 10)).astype(np.float32)
+    output = TestDataTypeAttrJob(input).get().numpy()
+    assert output.dtype == type_name_to_np_type[data_type]
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestDataTypeAttr(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_data_type_attr(test_case):
+        for data_type in ["float32", "double", "int8", "int32", "int64", "uint8"]:
+            RunTest(data_type)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestDynamicSource.py b/python/oneflow/compatible/single_client/test/ops/test_TestDynamicSource.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc902533bbdcd737bcb1d2e26fdc31333f45741d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestDynamicSource.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def my_test_source(name):
+    return (
+        flow.user_op_builder(name)
+        .Op("TestDynamicSource")
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestDynamicSource(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_test_dynamic_source(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def TestSourceJob():
+            with flow.scope.placement("cpu", "0:0"):
+                ret = my_test_source("my_cc_test_source_op")
+            return ret
+
+        y = TestSourceJob().get().numpy_list()[0]
+        test_case.assertTrue(np.array_equal(y, np.arange(3.0)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestListDataTypeAndListShapeAndListStringAttr.py b/python/oneflow/compatible/single_client/test/ops/test_TestListDataTypeAndListShapeAndListStringAttr.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd23d87f64678821a543f5770cbc4459370d87a2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestListDataTypeAndListShapeAndListStringAttr.py
@@ -0,0 +1,87 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def TestListDataTypeAndListShapeAndListStringAttr(
+    input, out_shapes, out_types, string_list
+):
+    assert isinstance(out_shapes, list)
+    assert isinstance(out_types, list)
+    return (
+        flow.user_op_builder("TestListDataTypeAndListShapeAndListStringAttr")
+        .Op("TestListDataTypeAndListShapeAndListStringAttr")
+        .Input("in", [input])
+        .Output("out", 3)
+        .Attr("out_shapes", out_shapes)
+        .Attr("out_types", out_types)
+        .Attr("string_list", string_list)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+
+
+def RunTest(out_shapes, out_types):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def TestListDataTypeAndListShapeAndListStringAttrJob(
+        input: oft.Numpy.Placeholder((10, 10), dtype=flow.float)
+    ):
+        return TestListDataTypeAndListShapeAndListStringAttr(
+            input,
+            out_shapes,
+            [type_name_to_flow_type[data_type] for data_type in out_types],
+            ["string1", "string2", "string3"],
+        )
+
+    input = np.random.random_sample((10, 10)).astype(np.float32)
+    outputs = [
+        x.numpy() for x in TestListDataTypeAndListShapeAndListStringAttrJob(input).get()
+    ]
+    for i in range(len(outputs)):
+        assert outputs[i].shape == out_shapes[i]
+        assert outputs[i].dtype == type_name_to_np_type[out_types[i]]
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["out_shapes"] = [[(4, 4), (6, 6), (8, 8)]]
+    arg_dict["out_types"] = [["float32", "double", "int8"], ["int32", "int64", "uint8"]]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestListDataTypeAndListShapeAndListStringAttr(flow.unittest.TestCase):
+    def test_data_type_attr(test_case):
+        for arg in gen_arg_list():
+            RunTest(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestMultiInputGrad.py b/python/oneflow/compatible/single_client/test/ops/test_TestMultiInputGrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f1711e0d98690b3355825b67125709a3235782
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestMultiInputGrad.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def TestMultiInput(x1, x2):
+    return (
+        flow.user_op_builder("my_test_multi_input")
+        .Op("TestMultiInput")
+        .Input("x1", [x1])
+        .Input("x2", [x2])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestMultiInputGrad(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_TestMultiInput_grad_mirrored_inplace(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        shape = (3, 3)
+
+        @flow.global_function(type="train", function_config=func_config)
+        def TestMultiInputJob():
+            with flow.scope.placement("gpu", "0:0"):
+                x1 = flow.get_variable(
+                    "x1",
+                    shape=shape,
+                    dtype=flow.float,
+                    initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                    trainable=True,
+                )
+                x2 = flow.get_variable(
+                    "x2",
+                    shape=shape,
+                    dtype=flow.float,
+                    initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                    trainable=True,
+                )
+                loss = TestMultiInput(x1, x2)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+                ).minimize(loss)
+                flow.watch(x1, test_global_storage.Setter("x1"))
+                flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
+                flow.watch(x2, test_global_storage.Setter("x2"))
+                flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
+                return loss
+
+        out = TestMultiInputJob().get()
+        x1_diff = test_global_storage.Get("x1_diff")
+        x2_diff = test_global_storage.Get("x2_diff")
+        expect_out = test_global_storage.Get("x1")
+        expect_x1_diff = np.ones(shape, dtype=np.float32)
+        expect_x2_diff = np.ones(shape, dtype=np.float32) * 2.0
+        assert np.allclose(out.numpy(), expect_out)
+        assert np.allclose(x1_diff, expect_x1_diff)
+        assert np.allclose(x2_diff, expect_x2_diff)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestMultiOutputOrder.py b/python/oneflow/compatible/single_client/test/ops/test_TestMultiOutputOrder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a72aac6cdb0580e93fe18c0ee9438674251a6dc
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestMultiOutputOrder.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def TestMultiOutputOrder(x, name):
+    return (
+        flow.user_op_builder(name)
+        .Op("TestMultiOutputOrder")
+        .Input("in", [x])
+        .Output("out1")
+        .Output("out2")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+
+
+def GenerateTest(test_case, shape):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def TestMultiOutputOrderJob(x: oft.Numpy.Placeholder(shape)):
+        return TestMultiOutputOrder(x, "my_2_output_op")
+
+    x = np.random.rand(*shape).astype(np.float32)
+    (out1, out2) = TestMultiOutputOrderJob(x).get()
+    out1_ndarray = out1.numpy()
+    out2_ndarray = out2.numpy()
+    out2_shape = list(shape)
+    out2_shape[-1] = out2_shape[-1] * 2
+    out2_shape = tuple(out2_shape)
+    test_case.assertTrue(shape == out1_ndarray.shape)
+    test_case.assertTrue(out2_shape == out2_ndarray.shape)
+    test_case.assertTrue(np.allclose(x, out1_ndarray))
+    test_case.assertTrue(
+        np.allclose(np.zeros(out2_shape, dtype=np.float32), out2_ndarray)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestMultiOutputOrder(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_TestMultiOutputOrder_example_1(test_case):
+        GenerateTest(test_case, (7,))
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_TestMultiOutputOrder_example_2(test_case):
+        GenerateTest(test_case, (2, 5))
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_TestMultiOutputOrder_example_3(test_case):
+        GenerateTest(test_case, (3, 3, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestRandomSource.py b/python/oneflow/compatible/single_client/test/ops/test_TestRandomSource.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe6e4ad96532936f786fbbe0d5dbe91440db0990
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestRandomSource.py
@@ -0,0 +1,64 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def my_test_source(name, seed):
+    return (
+        flow.user_op_builder(name)
+        .Op("TestRandomSource")
+        .Output("out")
+        .Attr("seed", seed)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestRandomSource(flow.unittest.TestCase):
+    def test_testsource(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def TestSourceJob():
+            with flow.scope.placement("cpu", "0:0"):
+                ret = my_test_source("my_cc_test_source_op", 0)
+            return ret
+
+        y = TestSourceJob().get().numpy()
+        rand_0_4 = np.array([0.5488136, 0.59284467, 0.7151894, 0.8442659, 0.6027634])
+        test_case.assertTrue(np.allclose(y, rand_0_4, atol=1e-05, rtol=1e-05))
+        y = TestSourceJob().get().numpy()
+        if flow.eager_execution_enabled():
+            rand_5_9 = rand_0_4
+        else:
+            rand_5_9 = np.array(
+                [0.85794574, 0.54488325, 0.84725183, 0.42365485, 0.62356377]
+            )
+        test_case.assertTrue(np.allclose(y, rand_5_9, atol=1e-05, rtol=1e-05))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestReshape.py b/python/oneflow/compatible/single_client/test/ops/test_TestReshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0e2e53003c307657d20104e7061926a5e106783
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestReshape.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def TestReshape(x, shape, name):
+    return (
+        flow.user_op_builder(name)
+        .Op("TestReshape")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("shape", shape)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def fixed_tensor_def_test(test_case, func_config):
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ReshapeJob(x: oft.Numpy.Placeholder((10, 2))):
+        return TestReshape(x, [5, 4], "xx_test_reshape")
+
+    x = np.random.rand(10, 2).astype(np.float32)
+    y = ReshapeJob(x).get().numpy()
+    print(y.shape)
+    test_case.assertTrue((5, 4) == y.shape)
+    test_case.assertTrue(np.array_equal(x.reshape(5, 4), y))
+
+
+def mirrored_tensor_def_test(test_case, func_config):
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ReshapeJob(x: oft.ListNumpy.Placeholder((10, 2))):
+        return TestReshape(x, [5, 4], "xx_test_reshape")
+
+    x = np.random.rand(10, 2).astype(np.float32)
+    y = ReshapeJob([x]).get().numpy_list()[0]
+    test_case.assertTrue((5, 4) == y.shape)
+    test_case.assertTrue(np.array_equal(x.reshape(5, 4), y))
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestReshape(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_fixed_TestReshape(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        fixed_tensor_def_test(test_case, func_config)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_mirrored_TestReshape(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        mirrored_tensor_def_test(test_case, func_config)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestSource.py b/python/oneflow/compatible/single_client/test/ops/test_TestSource.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f16968173d32469224de4f7d07e27cc35ee23f9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestSource.py
@@ -0,0 +1,69 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def my_test_source(name):
+    return (
+        flow.user_op_builder(name)
+        .Op("TestSource")
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def TODO_test_mirror_testsource(test_case):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def TestSourceJob():
+        with flow.scope.placement("cpu", "0:0"):
+            ret = my_test_source("my_cc_test_source_op")
+        return ret
+
+    y = TestSourceJob().get().numpy()
+    test_case.assertTrue(np.array_equal(y, np.arange(5.0)))
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestSource(flow.unittest.TestCase):
+    def test_testsource(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def TestSourceJob():
+            with flow.scope.placement("cpu", "0:0"):
+                ret = my_test_source("my_cc_test_source_op")
+            return ret
+
+        y = TestSourceJob().get().numpy()
+        test_case.assertTrue(np.array_equal(y, np.arange(5.0)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TestSourceMultiGpuFixedOutNum.py b/python/oneflow/compatible/single_client/test/ops/test_TestSourceMultiGpuFixedOutNum.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba34f2ca658831f04ea828549c4431669806d40e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TestSourceMultiGpuFixedOutNum.py
@@ -0,0 +1,57 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def my_test_source(name, out_num):
+    return (
+        flow.user_op_builder(name)
+        .Op("TestSourceMultiGpuFixedOutNum")
+        .Output("out")
+        .Attr("out_num", out_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_TestSourceMultiGpuFixedOutNum(flow.unittest.TestCase):
+    def test_testsource_2_gpu(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def TestSourceJob():
+            with flow.scope.placement("cpu", "0:0-1"):
+                ret = my_test_source("my_cc_test_source_op", 10)
+            return ret
+
+        y = TestSourceJob().get().numpy()
+        test_case.assertTrue(
+            np.array_equal(y, np.append(np.arange(5.0), np.arange(5.0)))
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_TripletMarginLoss.py b/python/oneflow/compatible/single_client/test/ops/test_TripletMarginLoss.py
new file mode 100644
index 0000000000000000000000000000000000000000..378c022eaeae0db8a5970040edbbf3df012b1fd7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_TripletMarginLoss.py
@@ -0,0 +1,253 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_triplet_margin_loss_with_np(
+    anchor_shape,
+    pos_shape,
+    neg_shape,
+    eps,
+    margin,
+    p,
+    swap,
+    device_type,
+    machine_ids,
+    device_counts,
+):
+    anchor = np.random.random(size=anchor_shape).astype(np.float32)
+    pos = np.random.random(size=pos_shape).astype(np.float32)
+    neg = np.random.random(size=neg_shape).astype(np.float32)
+    eps = eps
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def np_triplet_margin_loss(np_anchor, np_pos, np_neg, eps, np_margin, np_p, swap):
+        np_d_1_norm = np.power(np.abs(np_anchor - np_pos + eps), np_p)
+        np_d_2_norm = np.power(np.abs(np_anchor - np_neg + eps), np_p)
+        np_d_1 = np.power(np.sum(np_d_1_norm, axis=-1), 1.0 / np_p)
+        np_d_2 = np.power(np.sum(np_d_2_norm, axis=-1), 1.0 / np_p)
+        if swap:
+            np_dist_swap = np.power(np.abs(np_pos - np_neg + eps), np_p)
+            np_dist_swap = np.power(np.sum(np_dist_swap, axis=-1), 1.0 / np_p)
+            np_d_2 = np.minimum(np_d_2, np_dist_swap)
+        np_triplet_margin_loss = np.maximum(np_margin + np_d_1 - np_d_2, 0)
+        np_triplet_margin_loss_mean = np.mean(np_triplet_margin_loss)
+        np_triplet_margin_loss_sum = np.sum(np_triplet_margin_loss)
+        return {
+            "np_triplet_margin_loss": np_triplet_margin_loss,
+            "np_triplet_margin_loss_mean": np_triplet_margin_loss_mean,
+            "np_triplet_margin_loss_sum": np_triplet_margin_loss_sum,
+        }
+
+    np_out_tripletloss_dict = np_triplet_margin_loss(
+        anchor, pos, neg, eps, margin, p, swap
+    )
+
+    def np_triplet_loss_diff(anchor, pos, neg, margin, p):
+        def _compute_distance(x1, x2, x3):
+            d_1_norm = np.power(np.abs(x1 - x2 + 1e-06), p)
+            d_2_norm = np.power(np.abs(x1 - x3 + 1e-06), p)
+            d_1 = np.power(np.sum(d_1_norm, axis=-1), 1.0 / p)
+            d_2 = np.power(np.sum(d_2_norm, axis=-1), 1.0 / p)
+            return d_1 - d_2 + margin
+
+        def _compute_per_diff(x1, x2, p, eps=1e-06):
+            _abs_index = np.where(x1 - x2 > 0, 1, -1)
+            _abs_index_support = np.where(x1 - x2 == 0, 1, 0)
+            _abs_grad = _abs_index + _abs_index_support
+            _abs_val = np.abs(x1 - x2 + eps)
+            _power_abs_val = np.power(_abs_val, p)
+            _sum_val = np.sum(_power_abs_val, axis=1, keepdims=True)
+            _sqrt_sum_val = np.power(_sum_val + eps, 1.0 / p - 1)
+            _power_val = np.power(_abs_val, p - 1)
+            _grad = np.multiply(_sqrt_sum_val, _power_val)
+            _grad *= _abs_grad
+            return _grad / x1.shape[0]
+
+        d = _compute_distance(anchor, pos, neg)
+        zero_index = np.where(d < -1e-06)
+        anchor_grad_1 = _compute_per_diff(anchor, pos, p)
+        anchor_grad_2 = _compute_per_diff(anchor, neg, p)
+        total_grad = anchor_grad_1 - anchor_grad_2
+        for i in zero_index:
+            total_grad[i] = 0
+        grad_dict = {"np_triplet_loss_grad_mean": total_grad}
+        return grad_dict
+
+    np_grad_dict = np_triplet_loss_diff(anchor, pos, neg, margin, p)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad_dict["np_triplet_loss_grad_mean"], atol=0.002)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_marginloss(
+        of_anchor: tp.Numpy.Placeholder(shape=anchor.shape),
+        of_pos: tp.Numpy.Placeholder(shape=pos.shape),
+        of_neg: tp.Numpy.Placeholder(shape=neg.shape),
+    ) -> Dict[str, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=anchor.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+                name="x_var",
+            )
+            x_anchor = of_anchor + v
+        flow.watch_diff(x_anchor, assert_prediction_grad)
+        triplet_marginloss = flow.nn.TripletMarginLoss(
+            x_anchor,
+            of_pos,
+            of_neg,
+            margin=margin,
+            p=p,
+            swap=swap,
+            reduction="none",
+            name="of_tripletmarginloss",
+        )
+        triplet_marginloss_mean = flow.nn.TripletMarginLoss(
+            x_anchor,
+            of_pos,
+            of_neg,
+            margin=margin,
+            p=p,
+            swap=swap,
+            reduction="mean",
+            name="of_tripletmarginloss_mean",
+        )
+        triplet_marginloss_sum = flow.nn.TripletMarginLoss(
+            x_anchor,
+            of_pos,
+            of_neg,
+            margin=margin,
+            p=p,
+            swap=swap,
+            reduction="sum",
+            name="of_tripletmarginloss_sum",
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(triplet_marginloss_mean)
+        return {
+            "of_triplet_margin_loss": triplet_marginloss,
+            "of_triplet_margin_loss_mean": triplet_marginloss_mean,
+            "of_triplet_margin_loss_sum": triplet_marginloss_sum,
+        }
+
+    of_out_tripletloss_dict = oneflow_marginloss(anchor, pos, neg)
+    assert np.allclose(
+        of_out_tripletloss_dict["of_triplet_margin_loss"],
+        np_out_tripletloss_dict["np_triplet_margin_loss"],
+        atol=0.001,
+    )
+    assert np.allclose(
+        of_out_tripletloss_dict["of_triplet_margin_loss_mean"],
+        np_out_tripletloss_dict["np_triplet_margin_loss_mean"],
+        atol=0.001,
+    )
+    assert np.allclose(
+        of_out_tripletloss_dict["of_triplet_margin_loss_sum"],
+        np_out_tripletloss_dict["np_triplet_margin_loss_sum"],
+        atol=0.001,
+    )
+
+
+def _gen_arg_dict(shape, eps, margin, p, swap, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["anchor_shape"] = [shape]
+    arg_dict["pos_shape"] = [shape]
+    arg_dict["neg_shape"] = [shape]
+    arg_dict["eps"] = [eps]
+    arg_dict["margin"] = [margin]
+    arg_dict["p"] = [p]
+    arg_dict["swap"] = [swap]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test_triplet_loss_1n1d(flow.unittest.TestCase):
+    def test_triplet_margin_loss_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3),
+            eps=1e-06,
+            margin=1,
+            p=1.5,
+            swap=False,
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_triplet_margin_loss_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_margin_ranking_loss_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 6),
+            eps=1e-06,
+            margin=1,
+            p=2.0,
+            swap=False,
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_triplet_margin_loss_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testmarginloss1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_margin_ranking_loss_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(6, 6),
+            eps=1e-06,
+            margin=1,
+            p=2.0,
+            swap=False,
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_triplet_margin_loss_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_activations.py b/python/oneflow/compatible/single_client/test/ops/test_activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..b28c7f117020431ed0d83e8e47e7580a25f1f0a1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_activations.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, activation_type, shape, data_type):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    flow.config.enable_debug_mode(True)
+    func_config = flow.FunctionConfig()
+    if data_type == flow.float16:
+        func_config.enable_auto_mixed_precision(True)
+        data_type = flow.float
+    func_config.default_data_type(data_type)
+    of_activation_map = {
+        "relu": flow.nn.relu,
+        "sigmoid": flow.math.sigmoid,
+        "tanh": flow.math.tanh,
+    }
+    tf_activation_map = {
+        "relu": tf.nn.relu,
+        "sigmoid": tf.math.sigmoid,
+        "tanh": tf.math.tanh,
+    }
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ActivationJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=shape,
+                dtype=data_type,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = of_activation_map[activation_type](x)
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.0001])
+            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ActivationJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf_activation_map[activation_type](x)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    rtol = 1e-05
+    atol = 1e-05
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol, atol)
+    assert np.allclose(test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol, atol)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestActivations(flow.unittest.TestCase):
+    def test_activations(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["activation_type"] = ["relu", "sigmoid", "tanh"]
+        arg_dict["shape"] = [(64, 64)]
+        arg_dict["data_type"] = [flow.float, flow.double]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None:
+            for act_type in arg_dict["activation_type"]:
+                compare_with_tensorflow("gpu", act_type, (64, 64), flow.float16)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_add.py b/python/oneflow/compatible/single_client/test/ops/test_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..6654521537c7d4ccadc91bdfab2352ccf08d1ff6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_add.py
@@ -0,0 +1,96 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import Args, CompareOpWithTensorFlow, GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def GenerateTest(test_case, a_shape, b_shape):
+    @flow.global_function(function_config=func_config)
+    def AddJob(a: oft.Numpy.Placeholder(a_shape), b: oft.Numpy.Placeholder(b_shape)):
+        return a + b
+
+    a = np.random.rand(*a_shape).astype(np.float32)
+    b = np.random.rand(*b_shape).astype(np.float32)
+    y = AddJob(a, b).get().numpy()
+    test_case.assertTrue(np.array_equal(y, a + b))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAdd(flow.unittest.TestCase):
+    def test_naive(test_case):
+        @flow.global_function(function_config=func_config)
+        def AddJob(a: oft.Numpy.Placeholder((5, 2)), b: oft.Numpy.Placeholder((5, 2))):
+            return a + b + b
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = np.random.rand(5, 2).astype(np.float32)
+        z = None
+        z = AddJob(x, y).get().numpy()
+        test_case.assertTrue(np.array_equal(z, x + y + y))
+
+    def test_broadcast(test_case):
+        @flow.global_function(function_config=func_config)
+        def AddJob(a: oft.Numpy.Placeholder((5, 2)), b: oft.Numpy.Placeholder((1, 2))):
+            return a + b
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = np.random.rand(1, 2).astype(np.float32)
+        z = None
+        z = AddJob(x, y).get().numpy()
+        test_case.assertTrue(np.array_equal(z, x + y))
+
+    def test_xy_add_x1(test_case):
+        GenerateTest(test_case, (64, 64), (64, 1))
+
+    def test_xy_add_1y(test_case):
+        GenerateTest(test_case, (64, 64), (1, 64))
+
+    def test_xyz_add_x1z(test_case):
+        GenerateTest(test_case, (64, 64, 64), (64, 1, 64))
+
+    def test_xyz_add_1y1(test_case):
+        GenerateTest(test_case, (64, 64, 64), (1, 64, 1))
+
+    def test_scalar_add(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.add]
+        arg_dict["tf_op"] = [tf.math.add]
+        arg_dict["input_shape"] = [(10, 10, 10)]
+        arg_dict["op_args"] = [
+            Args([1]),
+            Args([-1]),
+            Args([84223.19348]),
+            Args([-3284.139]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareOpWithTensorFlow(**arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_add_n.py b/python/oneflow/compatible/single_client/test/ops/test_add_n.py
new file mode 100644
index 0000000000000000000000000000000000000000..21a79ec1103031b89cc3182464bc2beaebf86ae6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_add_n.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from typing import Tuple
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def GenerateTest(test_case, shape, num_inputs):
+    @flow.global_function(function_config=func_config)
+    def AddJob(xs: Tuple[(oft.Numpy.Placeholder(shape),) * num_inputs]):
+        return flow.math.add_n(xs)
+
+    inputs = tuple(
+        (np.random.rand(*shape).astype(np.float32) for i in range(num_inputs))
+    )
+    r = AddJob(inputs).get().numpy()
+    test_case.assertTrue(np.allclose(r, sum(inputs)))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAddN(flow.unittest.TestCase):
+    def test_naive(test_case):
+        @flow.global_function(function_config=func_config)
+        def AddJob(xs: Tuple[(oft.Numpy.Placeholder((5, 2)),) * 3]):
+            return flow.math.add_n(xs)
+
+        inputs = tuple((np.random.rand(5, 2).astype(np.float32) for i in range(3)))
+        r = AddJob(inputs).get().numpy()
+        test_case.assertTrue(np.allclose(r, sum(inputs)))
+
+    def test_2_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 2)
+
+    def test_3_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 3)
+
+    def test_4_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 4)
+
+    def test_5_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 5)
+
+    def test_6_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 6)
+
+    def test_7_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 7)
+
+    def test_8_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 8)
+
+    def test_9_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 9)
+
+    def test_10_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 10)
+
+    def test_11_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 11)
+
+    def test_12_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 12)
+
+    def test_13_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 13)
+
+    def test_14_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 14)
+
+    def test_15_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 15)
+
+    def test_16_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 16)
+
+    def test_100_inputs(test_case):
+        GenerateTest(test_case, (64, 64), 100)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_all_reduce_group.py b/python/oneflow/compatible/single_client/test/ops/test_all_reduce_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b9776aed282cea7800529e315c3218a68a0304
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_all_reduce_group.py
@@ -0,0 +1,61 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def do_test(test_case, mirrored):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(2)
+    func_config = flow.FunctionConfig()
+    if mirrored:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(type="train", function_config=func_config)
+    def Foo():
+        w = flow.get_variable("w", (10,), initializer=flow.constant_initializer(1))
+        lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [5])
+        flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(w)
+        return w
+
+    r1 = Foo().get().numpy()
+    test_case.assertTrue(np.all(r1 == 1.0))
+    r2 = Foo().get().numpy()
+    test_case.assertTrue(np.all(r2 == 0.5))
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestAllReduceGroup(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_variable_as_loss_on_two_device(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["mirrored"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            do_test(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_argmax.py b/python/oneflow/compatible/single_client/test/ops/test_argmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..d377928d08edde6bdcc066ea9cee1494ba7c4b1e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_argmax.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, in_shape, axis, data_type):
+    assert device_type in ["gpu", "cpu"]
+    assert data_type in ["float32", "double", "int8", "int32", "int64"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ArgMaxJob(
+        input: oft.ListNumpy.Placeholder(
+            tuple([dim + 10 for dim in in_shape]),
+            dtype=type_name_to_flow_type[data_type],
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.argmax(input, axis)
+
+    input = (np.random.random(in_shape) * 100).astype(type_name_to_np_type[data_type])
+    of_out = ArgMaxJob([input]).get().numpy_list()[0]
+    tf_out = tf.math.argmax(input, axis).numpy()
+    tf_out = np.array([tf_out]) if isinstance(tf_out, np.int64) else tf_out
+    assert np.array_equal(of_out, tf_out)
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["gpu", "cpu"]
+    arg_dict["in_shape"] = [(100,), (10, 10, 20), (10, 1000)]
+    arg_dict["axis"] = [-1]
+    arg_dict["data_type"] = ["double", "int64"]
+    return GenArgList(arg_dict)
+
+
+def gen_arg_list_for_test_axis():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["gpu"]
+    arg_dict["in_shape"] = [(10, 10, 20, 30)]
+    arg_dict["axis"] = [-2, 0, 1, 2]
+    arg_dict["data_type"] = ["float32", "int32"]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestArgmax(flow.unittest.TestCase):
+    def test_argmax(test_case):
+        for arg in gen_arg_list():
+            compare_with_tensorflow(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_argmax_gpu(test_case):
+        for arg in gen_arg_list_for_test_axis():
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_argsort.py b/python/oneflow/compatible/single_client/test/ops/test_argsort.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cd7643d61711a1d5fe363b1abdef1853841bdd0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_argsort.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, in_shape, axis, direction, data_type):
+    assert device_type in ["gpu", "cpu"]
+    assert data_type in ["float32", "double", "int8", "int32", "int64"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ArgSortJob(
+        input: oft.ListNumpy.Placeholder(
+            tuple([dim + 10 for dim in in_shape]),
+            dtype=type_name_to_flow_type[data_type],
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.argsort(input, axis, direction)
+
+    input = (np.random.random(in_shape) * 100).astype(type_name_to_np_type[data_type])
+    of_out = ArgSortJob([input]).get().numpy_list()[0]
+    tf_out = tf.argsort(input, axis, direction)
+    assert np.array_equal(of_out, tf_out.numpy())
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["in_shape"] = [(10,), (10, 10, 20)]
+    arg_dict["axis"] = [-1]
+    arg_dict["direction"] = ["ASCENDING", "DESCENDING"]
+    arg_dict["data_type"] = ["double", "int32"]
+    return GenArgList(arg_dict)
+
+
+def gen_arg_list_for_test_axis():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["gpu"]
+    arg_dict["in_shape"] = [(10, 10, 20)]
+    arg_dict["axis"] = [-2, 0, 2]
+    arg_dict["direction"] = ["ASCENDING", "DESCENDING"]
+    arg_dict["data_type"] = ["float32", "int64"]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestArgsort(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_argsort(test_case):
+        for arg in gen_arg_list():
+            compare_with_tensorflow(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_argsort_gpu(test_case):
+        for arg in gen_arg_list_for_test_axis():
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_argwhere.py b/python/oneflow/compatible/single_client/test/ops/test_argwhere.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe62a5c0fa7fbe3a6e55d5b08c048bb7bf534184
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_argwhere.py
@@ -0,0 +1,198 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def _np_dtype_to_of_dtype(np_dtype):
+    if np_dtype == np.float32:
+        return flow.float32
+    elif np_dtype == np.int32:
+        return flow.int32
+    elif np_dtype == np.int64:
+        return flow.int64
+    elif np_dtype == np.int8:
+        return flow.int8
+    else:
+        raise NotImplementedError
+
+
+def _random_input(shape, dtype):
+    if dtype == np.float32:
+        rand_ = np.random.random_sample(shape).astype(np.float32)
+        rand_[np.nonzero(rand_ < 0.5)] = 0.0
+        return rand_
+    elif dtype == np.int32:
+        return np.random.randint(low=0, high=2, size=shape).astype(np.int32)
+    elif dtype == np.int8:
+        return np.random.randint(low=0, high=2, size=shape).astype(np.int8)
+    else:
+        raise NotImplementedError
+
+
+def _of_argwhere(x, index_dtype, device_type="gpu", device_num=1, dynamic=False):
+    data_type = _np_dtype_to_of_dtype(x.dtype)
+    out_data_type = _np_dtype_to_of_dtype(index_dtype)
+    flow.clear_default_session()
+    if device_type == "gpu":
+        flow.config.gpu_device_num(device_num)
+    elif device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        raise ValueError
+    assert device_num > 0
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(data_type)
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    if dynamic is True:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function("predict", function_config=func_config)
+        def argwhere_fn(
+            x: flow.typing.ListNumpy.Placeholder(x.shape, dtype=data_type)
+        ) -> flow.typing.ListNumpy:
+            return flow.argwhere(x, dtype=out_data_type)
+
+        return argwhere_fn([x] * device_num)[0]
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function("predict", function_config=func_config)
+        def argwhere_fn(
+            x: flow.typing.Numpy.Placeholder(x.shape, dtype=data_type)
+        ) -> flow.typing.ListNumpy:
+            return flow.argwhere(x, dtype=out_data_type)
+
+        return argwhere_fn(x)[0]
+
+
+def _compare_with_np(
+    test_case,
+    shape,
+    value_dtype,
+    index_dtype,
+    device_type="gpu",
+    device_num=1,
+    dynamic=False,
+    verbose=False,
+):
+    if verbose:
+        print("shape:", shape)
+        print("value_dtype:", value_dtype)
+        print("index_dtype:", index_dtype)
+        print("device_type:", device_type)
+        print("device_num:", device_num)
+        print("dynamic:", dynamic)
+    x = _random_input(shape, value_dtype)
+    y = np.argwhere(x)
+    of_y = _of_argwhere(
+        x, index_dtype, device_type=device_type, device_num=device_num, dynamic=dynamic
+    )
+    if verbose is True:
+        print("input:", x)
+        print("np result:", y)
+        print("of result:", of_y)
+    test_case.assertTrue(np.array_equal(y, of_y))
+
+
+def _dynamic_multi_iter_compare(
+    test_case,
+    iter_num,
+    shape,
+    value_dtype,
+    index_dtype,
+    device_type="gpu",
+    verbose=False,
+):
+    x = [_random_input(shape, value_dtype) for _ in range(iter_num)]
+    y = [np.argwhere(x_) for x_ in x]
+    data_type = _np_dtype_to_of_dtype(value_dtype)
+    out_data_type = _np_dtype_to_of_dtype(index_dtype)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(data_type)
+    func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function("predict", function_config=func_config)
+    def argwhere_fn(
+        x: flow.typing.Numpy.Placeholder(tuple(shape), dtype=data_type)
+    ) -> flow.typing.ListNumpy:
+        return flow.argwhere(x, dtype=out_data_type)
+
+    results = []
+    for x_ in x:
+        y_ = argwhere_fn(x_)[0]
+        results.append(y_)
+    for (i, result) in enumerate(results):
+        test_case.assertTrue(np.array_equal(result, y[i]))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestArgwhere(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_argwhere(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [10, (30, 4), (8, 256, 20)]
+        arg_dict["value_dtype"] = [np.float32, np.int32, np.int8]
+        arg_dict["index_dtype"] = [np.int32, np.int64]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["dynamic"] = [True, False]
+        arg_dict["verbose"] = [False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_argwhere_multi_iter(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["iter_num"] = [2]
+        arg_dict["shape"] = [(20, 4)]
+        arg_dict["value_dtype"] = [np.float32, np.int32, np.int8]
+        arg_dict["index_dtype"] = [np.int32, np.int64]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["verbose"] = [False]
+        for arg in GenArgDict(arg_dict):
+            _dynamic_multi_iter_compare(test_case, **arg)
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestArgwhere4D(flow.unittest.TestCase):
+    @unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+    def test_argwhere(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(10, 5)]
+        arg_dict["value_dtype"] = [np.float32, np.int32, np.int8]
+        arg_dict["index_dtype"] = [np.int32, np.int64]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["device_num"] = [4]
+        arg_dict["dynamic"] = [True]
+        arg_dict["verbose"] = [False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_assign.py b/python/oneflow/compatible/single_client/test/ops/test_assign.py
new file mode 100644
index 0000000000000000000000000000000000000000..84b82d0e208035d604fe7279fa13e1d63e818744
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_assign.py
@@ -0,0 +1,173 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+flow_to_np_dtype_dict = {
+    flow.int32: np.int32,
+    flow.float: np.single,
+    flow.double: np.float,
+}
+
+
+def _random_input(shape, dtype):
+    if np.issubdtype(dtype, np.integer):
+        return np.random.random_integers(low=-10, high=10, size=shape)
+    elif np.issubdtype(dtype, np.floating):
+        rng = np.random.default_rng()
+        return rng.standard_normal(size=shape, dtype=dtype)
+    else:
+        raise NotImplementedError
+
+
+def _of_assign_and_relu(value, dtype, device_type, assign=flow.assign):
+    flow.clear_default_session()
+    if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None:
+        flow.config.gpu_device_num(1)
+    flow.config.cpu_device_num(1)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(dtype)
+    func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))
+
+    @flow.global_function(function_config=func_config)
+    def assign_fn(value_def: oft.Numpy.Placeholder(value.shape, dtype=dtype)):
+        var = flow.get_variable(
+            name="var",
+            shape=value.shape,
+            dtype=dtype,
+            initializer=flow.constant_initializer(0),
+        )
+        assign(var, value_def)
+
+    @flow.global_function(function_config=func_config)
+    def relu_fn():
+        var = flow.get_variable(
+            name="var",
+            shape=value.shape,
+            dtype=dtype,
+            initializer=flow.constant_initializer(0),
+        )
+        return flow.nn.relu(var)
+
+    assign_fn(value)
+    return relu_fn().get().numpy()
+
+
+def _np_relu(x):
+    return np.maximum(x, 0)
+
+
+def _compare_with_np(test_case, shape, dtype, device_type, assign):
+    x = _random_input(shape, flow_to_np_dtype_dict[dtype])
+    of_y = _of_assign_and_relu(x, dtype, device_type, assign=assign)
+    test_case.assertTrue(np.allclose(_np_relu(x), of_y))
+
+
+@flow.unittest.skip_unless_2n1d()
+class TestTwoNodeAssign(flow.unittest.TestCase):
+    def test_2node_assign(test_case):
+        if flow.eager_execution_enabled():
+            assign = flow.experimental.eager_assign_121
+        else:
+            assign = flow.assign
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [10, (30, 4), (8, 256, 20)]
+        arg_dict["dtype"] = [flow.float, flow.double]
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["assign"] = [assign]
+        for arg in GenArgDict(arg_dict):
+            _2node_compare_with_np(test_case, **arg)
+
+
+def _2node_compare_with_np(test_case, shape, dtype, device_type, assign):
+    x = _random_input(shape, flow_to_np_dtype_dict[dtype])
+    of_y = _2node_of_assign_and_relu(x, dtype, device_type, assign=assign)
+    np_y = _np_relu(x)
+    test_case.assertTrue(np.allclose(np_y, of_y))
+
+
+def _2node_of_assign_and_relu(value, dtype, device_type, assign=flow.assign):
+    flow.clear_default_session()
+    flow.config.machine_num(2)
+    if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None:
+        flow.config.gpu_device_num(1)
+    flow.config.cpu_device_num(1)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(dtype)
+    func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))
+
+    @flow.global_function(function_config=func_config)
+    def assign_fn(value_def: oft.Numpy.Placeholder(value.shape, dtype=dtype)):
+        with flow.scope.placement(device_type, "1:0"):
+            var = flow.get_variable(
+                name="var",
+                shape=value.shape,
+                dtype=dtype,
+                initializer=flow.constant_initializer(0),
+            )
+            assign(var, value_def)
+
+    @flow.global_function(function_config=func_config)
+    def relu_fn():
+        with flow.scope.placement(device_type, "1:0"):
+            var = flow.get_variable(
+                name="var",
+                shape=value.shape,
+                dtype=dtype,
+                initializer=flow.constant_initializer(0),
+            )
+        ret = flow.nn.relu(var)
+        return ret
+
+    assign_fn(value)
+    relu_ret = relu_fn().get()
+    return relu_ret.numpy()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAssign(flow.unittest.TestCase):
+    def test_assign(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [10, (30, 4), (8, 256, 20)]
+        arg_dict["dtype"] = [flow.float, flow.double]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["assign"] = [flow.assign]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    def test_eager_assign_121(test_case):
+        if not flow.eager_execution_enabled():
+            return
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [10, (30, 4), (8, 256, 20)]
+        arg_dict["dtype"] = [flow.float, flow.double]
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["assign"] = [flow.experimental.eager_assign_121]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_batch_gather.py b/python/oneflow/compatible/single_client/test/ops/test_batch_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff2cfceac30ebc8a27094f2934ebc07f44458030
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_batch_gather.py
@@ -0,0 +1,178 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_math_ops
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _random_inputs(params_shape, indices_shape):
+    params = np.random.rand(*params_shape).astype(np.float32)
+    indices = np.random.randint(
+        low=0,
+        high=params_shape[len(indices_shape) - 1],
+        size=indices_shape,
+        dtype=np.int32,
+    )
+    return (params, indices)
+
+
+def _make_gather_fn(
+    params, indices, axis, batch_dims, device_type, mirrored, compare_fn
+):
+    flow.clear_default_session()
+    flow.config.enable_debug_mode(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if mirrored:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+    def do_gather(x_blob, i_blob):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "params",
+                shape=params.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            x = x + x_blob
+            y = flow.gather(x, i_blob, axis=axis, batch_dims=batch_dims)
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(y)
+        flow.watch_diff(x, compare_fn)
+        return y
+
+    if mirrored:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def gather_fn(
+            params_def: oft.ListNumpy.Placeholder(params.shape, dtype=flow.float32),
+            indices_def: oft.ListNumpy.Placeholder(indices.shape, dtype=flow.int32),
+        ):
+            return do_gather(params_def, indices_def)
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def gather_fn(
+            params_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float32),
+            indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
+        ):
+            return do_gather(params_def, indices_def)
+
+    return gather_fn
+
+
+def _compare_gather_with_tf(
+    test_case,
+    device_type,
+    params_shape,
+    indices_shape,
+    axis,
+    batch_dims,
+    mirrored=False,
+):
+    (params, indices) = _random_inputs(params_shape, indices_shape)
+    i = tf.constant(indices.astype(np.int32))
+    with tf.GradientTape() as t:
+        x = tf.Variable(params.astype(np.float32))
+        y = tf.gather(x, i, axis=axis, batch_dims=axis)
+    dy = t.gradient(y, x)
+    if mirrored:
+
+        def compare_dy(params_grad):
+            test_case.assertTrue(
+                np.allclose(dy, params_grad.numpy_list()[0], atol=1e-05, rtol=1e-05)
+            )
+
+    else:
+
+        def compare_dy(params_grad):
+            test_case.assertTrue(
+                np.allclose(dy, params_grad.numpy(), atol=1e-05, rtol=1e-05)
+            )
+
+    gather_fn = _make_gather_fn(
+        params, indices, axis, batch_dims, device_type, mirrored, compare_dy
+    )
+    if mirrored:
+        of_y = gather_fn([params], [indices]).get().numpy_list()[0]
+    else:
+        of_y = gather_fn(params, indices).get().numpy()
+    test_case.assertTrue(np.array_equal(y.numpy(), of_y))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBatchGather(flow.unittest.TestCase):
+    def test_batch_gather(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["params_shape"] = [(2, 8, 4)]
+        arg_dict["indices_shape"] = [(2, 1)]
+        arg_dict["axis"] = [1]
+        arg_dict["batch_dims"] = [1]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+    def test_batch_gather_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(20, 10, 200)]
+        arg_dict["indices_shape"] = [(20, 10)]
+        arg_dict["axis"] = [1]
+        arg_dict["batch_dims"] = [1]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+    def test_batch_gather_case_2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["params_shape"] = [(20, 80, 30, 5)]
+        arg_dict["indices_shape"] = [(20, 40)]
+        arg_dict["axis"] = [1]
+        arg_dict["batch_dims"] = [1]
+        arg_dict["mirrored"] = [True]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+    def test_batch_gather_case_3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["params_shape"] = [(20, 80, 30, 5)]
+        arg_dict["indices_shape"] = [(20, 80, 20)]
+        arg_dict["axis"] = [2]
+        arg_dict["batch_dims"] = [2]
+        arg_dict["mirrored"] = [True]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_batch_normalization.py b/python/oneflow/compatible/single_client/test/ops/test_batch_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bb79d1a5f79f4d96b8af5d35caabf8eb002c638
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_batch_normalization.py
@@ -0,0 +1,594 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import Args, GenArgDict, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def TODO_test_train(test_case):
+    flow.config.enable_debug_mode(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def Foo(x: oft.Numpy.Placeholder((2, 8, 32, 32))):
+        y = flow.layers.batch_normalization(x, axis=1)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(flow.math.reduce_sum(y))
+
+    Foo(np.ones((2, 8, 32, 32), dtype=np.float32))
+
+
+def CompareNnBnWithTensorFlow(
+    test_case,
+    device_type,
+    input_shape,
+    data_type,
+    axis,
+    epsilon,
+    input_minval=-10,
+    input_maxval=10,
+    y_rtol=0.0001,
+    y_atol=0.0001,
+    x_diff_rtol=0.0001,
+    x_diff_atol=0.0001,
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float32)
+    x = np.random.uniform(low=input_minval, high=input_maxval, size=input_shape).astype(
+        np.float32
+    )
+    param_shape = input_shape[axis]
+    mean = np.random.uniform(
+        low=input_minval, high=input_maxval, size=param_shape
+    ).astype(np.float32)
+    variance = np.random.uniform(low=0, high=input_maxval, size=param_shape).astype(
+        np.float32
+    )
+    offset = np.random.uniform(
+        low=input_minval, high=input_maxval, size=param_shape
+    ).astype(np.float32)
+    scale = np.random.uniform(
+        low=input_minval, high=input_maxval, size=param_shape
+    ).astype(np.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowNnBnJob(
+        x_full_precision: oft.Numpy.Placeholder(x.shape),
+        mean: oft.Numpy.Placeholder(mean.shape),
+        variance: oft.Numpy.Placeholder(variance.shape),
+        offset: oft.Numpy.Placeholder(offset.shape),
+        scale: oft.Numpy.Placeholder(scale.shape),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x_full_precision += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+            )
+            if data_type == "float16":
+                x = flow.cast(x_full_precision, flow.float16)
+            else:
+                x = x_full_precision
+            y = flow.nn.batch_normalization(
+                x, mean, variance, offset, scale, epsilon, axis=axis
+            )
+            y = flow.cast(y, flow.float32)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+            ).minimize(y)
+            flow.watch_diff(x_full_precision, test_global_storage.Setter("x_diff"))
+            return y
+
+    of_y = FlowNnBnJob(x, mean, variance, offset, scale).get().numpy()
+    of_x_diff = test_global_storage.Get("x_diff")
+
+    def TensorFlowNnBn(x, mean, variance, offset, scale):
+        tf_params_shape = [1, 1, 1, 1]
+        tf_params_shape[axis] = input_shape[axis]
+        with tf.GradientTape(persistent=True) as tape:
+            x = tf.Variable(x)
+            if data_type == "float16":
+                x = tf.cast(x, tf.float16)
+            mean = tf.Variable(mean.reshape(tf_params_shape))
+            variance = tf.Variable(variance.reshape(tf_params_shape))
+            offset = tf.Variable(offset.reshape(tf_params_shape))
+            scale = tf.Variable(scale.reshape(tf_params_shape))
+            y = tf.cast(
+                tf.nn.batch_normalization(x, mean, variance, offset, scale, epsilon),
+                tf.float32,
+            )
+        x_diff = tape.gradient(y, x)
+        return (y.numpy(), x_diff.numpy())
+
+    msg = "device_type={}, input_shape={}, data_type={}, input_minval={}, input_maxval={}, y_rtol={}, y_atol={}, x_diff_rtol={}, x_diff_atol={}".format(
+        device_type,
+        input_shape,
+        data_type,
+        input_minval,
+        input_maxval,
+        y_rtol,
+        y_atol,
+        x_diff_rtol,
+        x_diff_atol,
+    )
+    (tf_y, tf_x_diff) = TensorFlowNnBn(x, mean, variance, offset, scale)
+    test_case.assertTrue(np.allclose(of_y, tf_y, rtol=y_rtol, atol=y_atol), msg)
+    test_case.assertTrue(
+        np.allclose(of_x_diff, tf_x_diff, rtol=x_diff_rtol, atol=x_diff_atol), msg
+    )
+
+
+def RunTensorFlowBn(x, tf_args, training, trainable):
+    x = x.astype(np.float32)
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(x)
+        tf_op = tf.keras.layers.BatchNormalization(*tf_args, trainable=trainable)
+        y = tf_op(x, training=training)
+    if trainable:
+        x_diff = tape.gradient(y, x)
+        return (y.numpy(), x_diff.numpy())
+    else:
+        return y.numpy()
+
+
+def RunOneflowLayerBn(
+    device_type, x, data_type, flow_args, training=True, trainable=True
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    if data_type == "float16":
+        func_config.enable_auto_mixed_precision(True)
+        dtype = flow.float
+        np_dtype = np.float32
+    else:
+        dtype = type_name_to_flow_type[data_type]
+        np_dtype = type_name_to_np_type[data_type]
+    x = x.astype(np_dtype)
+    func_config.default_data_type(dtype)
+    if trainable:
+        func_config_type = "train"
+    else:
+        func_config_type = "predict"
+
+    @flow.global_function(type=func_config_type, function_config=func_config)
+    def FlowJob(x_full_precision: oft.Numpy.Placeholder(x.shape, dtype=dtype)):
+        with flow.scope.placement(device_type, "0:0"):
+            x_full_precision += flow.get_variable(
+                name="v1", shape=(1,), dtype=dtype, initializer=flow.zeros_initializer()
+            )
+            if data_type == "float16":
+                x = flow.cast(x_full_precision, flow.float16)
+            else:
+                x = x_full_precision
+            y = flow.layers.batch_normalization(
+                x, *flow_args, trainable=trainable, training=training
+            )
+            y = flow.cast(y, flow.float)
+            if trainable:
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(y)
+                flow.watch_diff(x_full_precision, test_global_storage.Setter("x_diff"))
+            return y
+
+    y = FlowJob(x).get().numpy()
+    if trainable:
+        x_diff = test_global_storage.Get("x_diff")
+        return (y, x_diff)
+    else:
+        return y
+
+
+def CompareFp16WithFp32(
+    test_case,
+    device_type,
+    input_shape,
+    op_args=None,
+    input_minval=-10,
+    input_maxval=10,
+    y_rtol=1e-05,
+    y_atol=1e-05,
+    x_diff_rtol=1e-05,
+    x_diff_atol=1e-05,
+    training=True,
+    trainable=True,
+):
+    assert device_type in ["gpu", "cpu"]
+    if op_args is None:
+        (flow_args, tf_args) = ([], [])
+    else:
+        (flow_args, tf_args) = (op_args.flow_args, op_args.tf_args)
+    msg = "device_type={}, input_shape={}, op_args={}, input_minval={}, input_maxval={}, y_rtol={}, y_atol={}, x_diff_rtol={}, x_diff_atol={}, training={}, trainable={}".format(
+        device_type,
+        input_shape,
+        op_args,
+        input_minval,
+        input_maxval,
+        y_rtol,
+        y_atol,
+        x_diff_rtol,
+        x_diff_atol,
+        training,
+        trainable,
+    )
+    x = np.random.uniform(low=input_minval, high=input_maxval, size=input_shape)
+    if trainable:
+        (y_fp16, x_diff_fp16) = RunOneflowLayerBn(
+            device_type, x, "float16", flow_args, training=training, trainable=trainable
+        )
+        (y_fp32, x_diff_fp32) = RunOneflowLayerBn(
+            device_type, x, "float32", flow_args, training=training, trainable=trainable
+        )
+        test_case.assertTrue(np.allclose(y_fp16, y_fp32, rtol=y_rtol, atol=y_atol), msg)
+        test_case.assertTrue(
+            np.allclose(x_diff_fp16, x_diff_fp32, rtol=x_diff_rtol, atol=x_diff_atol),
+            msg,
+        )
+    else:
+        y_fp16 = RunOneflowLayerBn(
+            device_type, x, "float16", flow_args, training=training, trainable=trainable
+        )
+        y_fp32 = RunOneflowLayerBn(
+            device_type, x, "float32", flow_args, training=training, trainable=trainable
+        )
+        test_case.assertTrue(np.allclose(y_fp16, y_fp32, rtol=y_rtol, atol=y_atol), msg)
+
+
+def CompareBnWithTensorFlow(
+    test_case,
+    device_type,
+    input_shape,
+    data_type,
+    op_args=None,
+    input_minval=-10,
+    input_maxval=10,
+    y_rtol=0.01,
+    y_atol=0.01,
+    x_diff_rtol=0.01,
+    x_diff_atol=0.01,
+    training=True,
+    trainable=True,
+):
+    assert device_type in ["gpu", "cpu"]
+    assert data_type in ["float32"]
+    if op_args is None:
+        (flow_args, tf_args) = ([], [])
+    else:
+        (flow_args, tf_args) = (op_args.flow_args, op_args.tf_args)
+    x = np.random.uniform(low=input_minval, high=input_maxval, size=input_shape)
+    msg = "device_type={}, input_shape={}, data_type={}, op_args={}, input_minval={}, input_maxval={}, y_rtol={}, y_atol={}, x_diff_rtol={}, x_diff_atol={}, training={}, trainable={}".format(
+        device_type,
+        input_shape,
+        data_type,
+        op_args,
+        input_minval,
+        input_maxval,
+        y_rtol,
+        y_atol,
+        x_diff_rtol,
+        x_diff_atol,
+        training,
+        trainable,
+    )
+    if trainable:
+        (of_y, of_x_diff) = RunOneflowLayerBn(
+            device_type, x, data_type, flow_args, training=training, trainable=trainable
+        )
+        (tf_y, tf_x_diff) = RunTensorFlowBn(
+            x, tf_args, training=training, trainable=trainable
+        )
+        test_case.assertTrue(np.allclose(of_y, tf_y, rtol=y_rtol, atol=y_atol), msg)
+        test_case.assertTrue(
+            np.allclose(of_x_diff, tf_x_diff, rtol=x_diff_rtol, atol=x_diff_atol), msg
+        )
+    else:
+        of_y = RunOneflowLayerBn(
+            device_type, x, data_type, flow_args, training=training, trainable=trainable
+        )
+        tf_y = RunTensorFlowBn(x, tf_args, training=training, trainable=trainable)
+        test_case.assertTrue(np.allclose(of_y, tf_y, rtol=y_rtol, atol=y_atol), msg)
+
+
+def _test_batchnorm_add_relu(test_case, input_shape, axis, data_type):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_job(
+        x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
+        addend: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
+    ):
+        v = flow.get_variable(
+            name="v",
+            shape=(1,),
+            dtype=flow.float32,
+            initializer=flow.zeros_initializer(),
+        )
+        x = x + v
+        addend = addend + v
+        x1 = flow.identity(x)
+        x2 = flow.identity(x)
+        addend1 = flow.identity(addend)
+        addend2 = flow.identity(addend)
+        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
+        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
+        flow.watch_diff(addend1, test_global_storage.Setter("addend1_diff"))
+        flow.watch_diff(addend2, test_global_storage.Setter("addend2_diff"))
+        x1 = flow.cast(x1, data_type)
+        x2 = flow.cast(x2, data_type)
+        addend1 = flow.cast(addend1, data_type)
+        addend2 = flow.cast(addend2, data_type)
+        y1 = flow.layers.batch_normalization_add_relu(
+            x1, addend=addend1, axis=axis, name="BN1"
+        )
+        y2 = flow.math.relu(
+            flow.layers.batch_normalization(x2, axis=axis, name="BN2") + addend2
+        )
+        y1 = flow.cast(y1, flow.float32)
+        y2 = flow.cast(y2, flow.float32)
+        flow.watch(y1, test_global_storage.Setter("y1"))
+        flow.watch(y2, test_global_storage.Setter("y2"))
+        y1 = flow.where(flow.math.greater(y2, v), y1, v)
+        y2 = flow.where(flow.math.greater(y1, v), y2, v)
+        loss = y1 + y2
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(flow.math.reduce_sum(loss))
+        return loss
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    addend = np.random.rand(*input_shape).astype(np.float32)
+    test_job(x, addend).get()
+    tol = 0.001 if data_type == flow.float16 else 1e-05
+    y1 = test_global_storage.Get("y1")
+    y2 = test_global_storage.Get("y2")
+    test_case.assertTrue(np.allclose(y1, y2, rtol=tol, atol=tol))
+    x1_diff = test_global_storage.Get("x1_diff")
+    x2_diff = test_global_storage.Get("x2_diff")
+    test_case.assertTrue(np.allclose(x1_diff, x2_diff, rtol=tol, atol=tol))
+    addend1_diff = test_global_storage.Get("addend1_diff")
+    addend2_diff = test_global_storage.Get("addend2_diff")
+    test_case.assertTrue(np.allclose(addend1_diff, addend2_diff, rtol=tol, atol=tol))
+
+
+def _test_batchnorm_relu(test_case, input_shape, axis, data_type):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_job(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32)):
+        v = flow.get_variable(
+            name="v",
+            shape=(1,),
+            dtype=flow.float32,
+            initializer=flow.zeros_initializer(),
+        )
+        x = x + v
+        x1 = flow.identity(x)
+        x2 = flow.identity(x)
+        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
+        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
+        x1 = flow.cast(x1, data_type)
+        x2 = flow.cast(x2, data_type)
+        y1 = flow.layers.batch_normalization_relu(x1, axis=axis, name="BN1")
+        y2 = flow.math.relu(flow.layers.batch_normalization(x2, axis=axis, name="BN2"))
+        y1 = flow.cast(y1, flow.float32)
+        y2 = flow.cast(y2, flow.float32)
+        flow.watch(y1, test_global_storage.Setter("y1"))
+        flow.watch(y2, test_global_storage.Setter("y2"))
+        y1 = flow.where(flow.math.greater(y2, v), y1, v)
+        y2 = flow.where(flow.math.greater(y1, v), y2, v)
+        loss = y1 + y2
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(flow.math.reduce_sum(loss))
+        return loss
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    test_job(x).get()
+    tol = 0.001 if data_type == flow.float16 else 1e-05
+    y1 = test_global_storage.Get("y1")
+    y2 = test_global_storage.Get("y2")
+    test_case.assertTrue(np.allclose(y1, y2, rtol=tol, atol=tol))
+    x1_diff = test_global_storage.Get("x1_diff")
+    x2_diff = test_global_storage.Get("x2_diff")
+    test_case.assertTrue(np.allclose(x1_diff, x2_diff, rtol=tol, atol=tol))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBatchNormalization(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_no_watch_scope_consistent(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        func_config.default_data_type(flow.float32)
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((2, 8, 32, 32))):
+            return flow.layers.batch_normalization(x)
+
+        Foo(np.ones((2, 8, 32, 32), dtype=np.float32))
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_train_consistent(test_case):
+        flow.config.enable_debug_mode(True)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        func_config.default_data_type(flow.float32)
+
+        @flow.global_function(type="train", function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((2, 8, 32, 32))):
+            y = flow.layers.batch_normalization(x, axis=1)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(flow.math.reduce_sum(y))
+
+        Foo(np.ones((2, 8, 32, 32), dtype=np.float32))
+
+    def test_layer_batchnorm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["input_shape"] = [(1, 4, 1, 2)]
+        arg_dict["op_args"] = [
+            Args([1]),
+            Args([2]),
+            Args([1, 0.95, 0.0001]),
+            Args([1, 0.99, 0.001, False]),
+            Args([1, 0.99, 0.001, False, False]),
+            Args([]),
+            Args([1, 0.95, 0.1]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareBnWithTensorFlow(test_case, **arg)
+
+    def test_layer_batchnorm_inference(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["input_shape"] = [(1, 4, 1, 2)]
+        arg_dict["op_args"] = [
+            Args([1]),
+            Args([2]),
+            Args([1, 0.95, 0.0001]),
+            Args([1, 0.99, 0.001, False]),
+            Args([1, 0.99, 0.001, False, False]),
+            Args([]),
+            Args([1, 0.95, 0.1]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareBnWithTensorFlow(test_case, **arg, training=False, trainable=False)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_layer_batchnorm_trainable_without_training(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["input_shape"] = [(2, 4, 3, 5)]
+        arg_dict["op_args"] = [
+            Args([1]),
+            Args([2]),
+            Args([1, 0.95, 0.0001]),
+            Args([1, 0.99, 0.001, False]),
+            Args([1, 0.99, 0.001, False, False]),
+            Args([]),
+            Args([1, 0.95, 0.1]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareBnWithTensorFlow(test_case, **arg, training=False, trainable=True)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_nn_batchnorm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["input_shape"] = [(2, 4, 3, 5)]
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["axis"] = [1, -1]
+        arg_dict["epsilon"] = [1.001e-05, 0.0001]
+        for arg in GenArgDict(arg_dict):
+            CompareNnBnWithTensorFlow(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_batchnorm_fp16(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["input_shape"] = [(2, 4, 3, 5)]
+        arg_dict["input_minval"] = [-2]
+        arg_dict["input_maxval"] = [2]
+        arg_dict["op_args"] = [
+            Args([1]),
+            Args([2]),
+            Args([1, 0.95, 0.0001]),
+            Args([1, 0.99, 0.001, False]),
+            Args([1, 0.99, 0.001, False, False]),
+            Args([]),
+            Args([1, 0.95, 0.1]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareFp16WithFp32(
+                test_case,
+                **arg,
+                training=False,
+                trainable=False,
+                y_rtol=0.001,
+                y_atol=0.001
+            )
+            CompareFp16WithFp32(
+                test_case,
+                **arg,
+                training=True,
+                trainable=True,
+                y_rtol=0.001,
+                y_atol=0.001,
+                x_diff_rtol=0.001,
+                x_diff_atol=0.001
+            )
+            CompareFp16WithFp32(
+                test_case,
+                **arg,
+                training=False,
+                trainable=True,
+                y_rtol=0.001,
+                y_atol=0.001,
+                x_diff_rtol=0.001,
+                x_diff_atol=0.001
+            )
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_batchnorm_add_relu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(5, 7, 9, 11)]
+        arg_dict["axis"] = [0, 1, 2, 3]
+        arg_dict["data_type"] = [flow.float32, flow.float16]
+        for arg in GenArgDict(arg_dict):
+            _test_batchnorm_add_relu(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_batchnorm_relu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(12, 16, 24, 32)]
+        arg_dict["axis"] = [0, 1, 2, 3]
+        arg_dict["data_type"] = [flow.float32, flow.float16]
+        for arg in GenArgDict(arg_dict):
+            _test_batchnorm_relu(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_bce_loss.py b/python/oneflow/compatible/single_client/test/ops/test_bce_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6ae866f71cdf8340768df0bf2c2bdd1d835d6e9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_bce_loss.py
@@ -0,0 +1,161 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_bceloss_with_np(
+    input_shape, target_shape, weight_shape, device_type, machine_ids, device_counts
+):
+    input = np.random.random(size=input_shape).astype(np.float32)
+    target = np.random.random(size=target_shape).astype(np.float32)
+    weight = np.random.random(size=weight_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_bceloss(np_input, np_target, np_weight):
+        np_bce = -np_weight * (
+            np_target * np.log(np_input) + (1 - np_target) * np.log(1 - np_input)
+        )
+        np_bce_mean = np.mean(np_bce)
+        np_bce_sum = np.sum(np_bce)
+        return {
+            "np_bce_loss": np_bce,
+            "np_bce_loss_mean": np_bce_mean,
+            "np_bce_loss_sum": np_bce_sum,
+        }
+
+    def np_bce_loss_diff(np_input, np_target, np_weight):
+        elemcnt = np_target.size
+        np_bce_grad_mean = (
+            -(np_weight / elemcnt)
+            * (np_target - np_input)
+            / ((1 - np_input) * np_input)
+        )
+        return {"np_bce_grad_mean": np_bce_grad_mean}
+
+    np_out_bceloss_dict = np_bceloss(input, target, weight)
+    np_grad_dict = np_bce_loss_diff(input, target, weight)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad_dict["np_bce_grad_mean"])
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_bceloss(
+        of_input: tp.Numpy.Placeholder(shape=input.shape),
+        of_target: tp.Numpy.Placeholder(shape=target.shape),
+        of_weight: tp.Numpy.Placeholder(shape=weight.shape),
+    ) -> Dict[str, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="v",
+            )
+            x_var = of_input + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        bceloss = flow.nn.BCELoss(
+            x_var, of_target, of_weight, reduction="none", name="of_mseloss"
+        )
+        bceloss_mean = flow.nn.BCELoss(
+            x_var, of_target, of_weight, reduction="mean", name="of_mseloss_reduce_mean"
+        )
+        bceloss_sum = flow.nn.BCELoss(
+            x_var, of_target, of_weight, reduction="sum", name="of_mseloss_reduce_sum"
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(bceloss_mean)
+        return {
+            "of_bce_loss": bceloss,
+            "of_bce_loss_mean": bceloss_mean,
+            "of_bce_loss_sum": bceloss_sum,
+        }
+
+    of_out_bceloss_dict = oneflow_bceloss(input, target, weight)
+    assert np.allclose(
+        of_out_bceloss_dict["of_bce_loss"], np_out_bceloss_dict["np_bce_loss"]
+    )
+    assert np.allclose(
+        of_out_bceloss_dict["of_bce_loss_mean"][0],
+        np_out_bceloss_dict["np_bce_loss_mean"],
+    )
+    assert np.allclose(
+        of_out_bceloss_dict["of_bce_loss_sum"][0],
+        np_out_bceloss_dict["np_bce_loss_sum"],
+    )
+
+
+def _gen_arg_dict(shape, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["target_shape"] = [shape]
+    arg_dict["weight_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testbceloss1n1d(flow.unittest.TestCase):
+    def test_bceloss_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3), device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_bceloss_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_bceloss_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32), device_type="gpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_bceloss_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testbceloss1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_bceloss_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 16), device_type="gpu", machine_ids="0:0-1", device_counts=2
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_bceloss_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_bce_with_logits_loss.py b/python/oneflow/compatible/single_client/test/ops/test_bce_with_logits_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b676e973e343802ea2fd61e13851fce6a31271d3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_bce_with_logits_loss.py
@@ -0,0 +1,197 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_bce_with_logits_loss_np(
+    input_shape,
+    target_shape,
+    weight_shape,
+    pos_weight_shape,
+    device_type,
+    machine_ids,
+    device_counts,
+):
+    input = np.random.random(size=input_shape).astype(np.float32) - 0.5
+    target = np.random.random(size=target_shape).astype(np.float32) - 0.5
+    pos_weight = np.random.random(size=pos_weight_shape).astype(np.float32)
+    weight = np.random.random(size=weight_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_bce_with_logits_loss(np_input, np_target, np_weight, np_pos_weight):
+        max_val = np.clip(-np_input, a_min=0, a_max=1000000.0)
+        if pos_weight.any():
+            log_weight = (np_pos_weight - 1) * np_target + 1
+            loss = (1 - np_target) * np_input
+            loss_1 = np.log(np.exp(-max_val) + np.exp(-np_input - max_val)) + max_val
+            loss += log_weight * loss_1
+        else:
+            loss = (1 - np_target) * np_input
+            loss += max_val
+            loss += np.log(np.exp(-max_val) + np.exp(-np_input - max_val))
+        np_bce = loss * np_weight
+        np_bce_mean = np.mean(np_bce)
+        np_bce_sum = np.sum(np_bce)
+        return {
+            "np_bce_with_logits_loss": np_bce,
+            "np_bce_with_logits_loss_mean": np_bce_mean,
+            "np_bce_with_logits_loss_sum": np_bce_sum,
+        }
+
+    def np_bce_with_logits_loss_diff(np_input, np_target, np_weight, np_pos_weight):
+        elemcnt = np_target.size
+        np_bce_with_logits_grad_mean = -(np_weight / elemcnt) * (
+            np_target
+            - 1
+            + ((1 - np_pos_weight) * np_target - 1)
+            * (-np.exp(-np_input) / (1 + np.exp(-np_input)))
+        )
+        return {"np_bce_with_logits_grad_mean": np_bce_with_logits_grad_mean}
+
+    np_out_bceloss_dict = np_bce_with_logits_loss(input, target, weight, pos_weight)
+    np_grad_dict = np_bce_with_logits_loss_diff(input, target, weight, pos_weight)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad_dict["np_bce_with_logits_grad_mean"])
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_bce_with_logits_loss(
+        of_input: tp.Numpy.Placeholder(shape=input.shape),
+        of_target: tp.Numpy.Placeholder(shape=target.shape),
+        of_weight: tp.Numpy.Placeholder(shape=weight.shape),
+        of_pos_weight: tp.Numpy.Placeholder(shape=pos_weight.shape),
+    ) -> Dict[str, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="v",
+            )
+            x_var = of_input + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        bceloss = flow.nn.BCEWithLogitsLoss(
+            x_var,
+            of_target,
+            of_weight,
+            of_pos_weight,
+            reduction="none",
+            name="of_mseloss",
+        )
+        bceloss_mean = flow.nn.BCEWithLogitsLoss(
+            x_var,
+            of_target,
+            of_weight,
+            of_pos_weight,
+            reduction="mean",
+            name="of_mseloss_reduce_mean",
+        )
+        bceloss_sum = flow.nn.BCEWithLogitsLoss(
+            x_var,
+            of_target,
+            of_weight,
+            of_pos_weight,
+            reduction="sum",
+            name="of_mseloss_reduce_sum",
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(bceloss_mean)
+        return {
+            "of_bce_with_logits_loss": bceloss,
+            "of_bce_with_logits_loss_mean": bceloss_mean,
+            "of_bce_with_logits_loss_sum": bceloss_sum,
+        }
+
+    of_out_bceloss_dict = oneflow_bce_with_logits_loss(
+        input, target, weight, pos_weight
+    )
+    assert np.allclose(
+        of_out_bceloss_dict["of_bce_with_logits_loss"],
+        np_out_bceloss_dict["np_bce_with_logits_loss"],
+    )
+    assert np.allclose(
+        of_out_bceloss_dict["of_bce_with_logits_loss_mean"][0],
+        np_out_bceloss_dict["np_bce_with_logits_loss_mean"],
+    )
+    assert np.allclose(
+        of_out_bceloss_dict["of_bce_with_logits_loss_sum"][0],
+        np_out_bceloss_dict["np_bce_with_logits_loss_sum"],
+    )
+
+
+def _gen_arg_dict(shape, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["target_shape"] = [shape]
+    arg_dict["weight_shape"] = [shape]
+    arg_dict["pos_weight_shape"] = [shape[-1]]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBCEWithLogitsLoss1n1d(flow.unittest.TestCase):
+    def test_bce_with_logits_loss_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3), device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_bce_with_logits_loss_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_bce_with_logits_loss_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 16), device_type="gpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_bce_with_logits_loss_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestBCEWithLogits1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_bce_with_logits_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8), device_type="gpu", machine_ids="0:0-1", device_counts=2
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_bce_with_logits_loss_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_bernoulli.py b/python/oneflow/compatible/single_client/test/ops/test_bernoulli.py
new file mode 100644
index 0000000000000000000000000000000000000000..87f04d2aa1ac16fdf59d237916a31f30896e9b8a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_bernoulli.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBernoulli(flow.unittest.TestCase):
+    def test_bernoulli(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        func_config.default_data_type(flow.float)
+
+        @flow.global_function(function_config=func_config)
+        def BernoulliJob(a: oft.Numpy.Placeholder((10, 10))):
+            return flow.random.bernoulli(a)
+
+        x = np.ones((10, 10), dtype=np.float32)
+        y = BernoulliJob(x).get().numpy()
+        test_case.assertTrue(np.array_equal(y, x))
+        x = np.zeros((10, 10), dtype=np.float32)
+        y = BernoulliJob(x).get().numpy()
+        test_case.assertTrue(np.array_equal(y, x))
+
+    def test_bernoulli_mirrored(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        func_config.default_data_type(flow.float)
+
+        @flow.global_function(function_config=func_config)
+        def BernoulliJob(a: oft.ListNumpy.Placeholder((10, 10))):
+            return flow.random.bernoulli(a)
+
+        x = np.ones((10, 10), dtype=np.float32)
+        y = BernoulliJob([x]).get().numpy_list()[0]
+        test_case.assertTrue(np.array_equal(y, x))
+        x = np.zeros((10, 10), dtype=np.float32)
+        y = BernoulliJob([x]).get().numpy_list()[0]
+        test_case.assertTrue(np.array_equal(y, x))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_bias_add.py b/python/oneflow/compatible/single_client/test/ops/test_bias_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..a697a0077ef852a1abcd4ee8f9e7c57295aaf292
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_bias_add.py
@@ -0,0 +1,153 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import Args, GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def RunOneflowBiasAdd(data_type, device_type, value, bias, flow_args):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowJob(
+        value: oft.Numpy.Placeholder(value.shape),
+        bias: oft.Numpy.Placeholder(bias.shape),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            value += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            bias += flow.get_variable(
+                name="v2",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            if data_type == "float16":
+                comp_value = flow.cast(value, dtype=flow.float16)
+                comp_bias = flow.cast(bias, dtype=flow.float16)
+            else:
+                comp_value = value
+                comp_bias = bias
+            loss = flow.nn.bias_add(comp_value, comp_bias, *flow_args)
+            if data_type == "float16":
+                loss = flow.cast(loss, dtype=flow.float)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+            ).minimize(loss)
+            flow.watch_diff(value, test_global_storage.Setter("value_diff"))
+            flow.watch_diff(bias, test_global_storage.Setter("bias_diff"))
+            return loss
+
+    y = FlowJob(value, bias).get().numpy()
+    value_diff = test_global_storage.Get("value_diff")
+    bias_diff = test_global_storage.Get("bias_diff")
+    return (y, value_diff, bias_diff)
+
+
+def RunTensorFlowBiasAdd(data_type, value, bias, tf_args):
+    with tf.GradientTape(persistent=True) as tape:
+        (value, bias) = (tf.Variable(value), tf.Variable(bias))
+        if data_type == "float16":
+            comp_value = tf.cast(value, tf.float16)
+            comp_bias = tf.cast(bias, tf.float16)
+        else:
+            comp_value = value
+            comp_bias = bias
+        y = tf.nn.bias_add(comp_value, comp_bias, *tf_args)
+    value_diff = tape.gradient(y, value).numpy()
+    bias_diff = tape.gradient(y, bias).numpy()
+    return (y.numpy(), value_diff, bias_diff)
+
+
+def CompareBiasAddWithTensorFlow(
+    data_type,
+    device_type,
+    input_shapes,
+    op_args=None,
+    input_minval=-10,
+    input_maxval=10,
+    y_rtol=0.001,
+    y_atol=0.001,
+    x_diff_rtol=0.01,
+    x_diff_atol=0.01,
+):
+    assert device_type in ["gpu", "cpu"]
+    if op_args is None:
+        (flow_args, tf_args) = ([], [])
+    else:
+        (flow_args, tf_args) = (op_args.flow_args, op_args.tf_args)
+    x = [
+        np.random.uniform(low=input_minval, high=input_maxval, size=input_shape).astype(
+            np.float32
+        )
+        for input_shape in input_shapes
+    ]
+    (of_y, of_x_diff1, of_x_diff2) = RunOneflowBiasAdd(
+        data_type, device_type, *x, flow_args
+    )
+    (tf_y, tf_x_diff1, tf_x_diff2) = RunTensorFlowBiasAdd(data_type, *x, tf_args)
+    assert np.allclose(of_y, tf_y, rtol=y_rtol, atol=y_atol)
+    assert np.allclose(of_x_diff1, tf_x_diff1, rtol=x_diff_rtol, atol=x_diff_atol)
+    assert np.allclose(of_x_diff2, tf_x_diff2, rtol=x_diff_rtol, atol=x_diff_atol)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBiasAdd(flow.unittest.TestCase):
+    def test_bias_add_nchw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["data_type"] = ["float16", "float32"]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shapes"] = [((1, 20, 1, 11), (20,)), ((2, 20, 1, 11), (20,))]
+        arg_dict["op_args"] = [Args(["NCHW"])]
+        for arg in GenArgDict(arg_dict):
+            if arg["data_type"] == "float16" and arg["device_type"] == "cpu":
+                continue
+            CompareBiasAddWithTensorFlow(**arg)
+
+    def test_bias_add_nhwc(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["data_type"] = ["float16", "float32"]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shapes"] = [((30, 20, 5, 10), (10,)), ((2, 5, 7, 7), (7,))]
+        arg_dict["op_args"] = [Args(["NHWC"])]
+        for arg in GenArgDict(arg_dict):
+            if arg["data_type"] == "float16" and arg["device_type"] == "cpu":
+                continue
+            CompareBiasAddWithTensorFlow(**arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_binary_elementwise_ops.py b/python/oneflow/compatible/single_client/test/ops/test_binary_elementwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf77e4f03dce80b0fc1251e9802e79ba405294ac
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_binary_elementwise_ops.py
@@ -0,0 +1,229 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import (
+    GenArgDict,
+    GenArgList,
+    type_name_to_flow_type,
+    type_name_to_np_type,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def RunOneflowBinaryOp(device_type, flow_op, x, y, data_type):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    flow_type = type_name_to_flow_type[data_type]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowJob(
+        x: oft.Numpy.Placeholder(x.shape, dtype=flow_type),
+        y: oft.Numpy.Placeholder(y.shape, dtype=flow_type),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="x",
+                shape=x.shape,
+                dtype=flow_type,
+                initializer=flow.zeros_initializer(),
+                trainable=True,
+            )
+            y += flow.get_variable(
+                name="y",
+                shape=y.shape,
+                dtype=flow_type,
+                initializer=flow.zeros_initializer(),
+                trainable=True,
+            )
+            loss = flow_op(x, y)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
+            return loss
+
+    out = FlowJob(x, y).get().numpy()
+    x_diff = test_global_storage.Get("x_diff")
+    y_diff = test_global_storage.Get("y_diff")
+    return (out, x_diff, y_diff)
+
+
+def RunTensorFlowBinaryOp(tf_op, x, y):
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(x)
+        y = tf.Variable(y)
+        out = tf_op(x, y)
+    x_diff = tape.gradient(out, x)
+    y_diff = tape.gradient(out, y)
+    return (out.numpy(), x_diff, y_diff)
+
+
+def compare_with_tensorflow(
+    test_case,
+    device_type,
+    flow_op,
+    tf_op,
+    x_shape,
+    y_shape,
+    data_type,
+    x_minval=-10,
+    x_maxval=10,
+    y_minval=-10,
+    y_maxval=10,
+    compare_grad=True,
+    out_rtol=1e-05,
+    out_atol=1e-05,
+    diff_rtol=1e-05,
+    diff_atol=1e-05,
+):
+    test_case.assertTrue(device_type in ["gpu", "cpu"])
+    np_type = type_name_to_np_type[data_type]
+    x = np.random.uniform(low=x_minval, high=x_maxval, size=x_shape).astype(np_type)
+    y = np.random.uniform(low=y_minval, high=y_maxval, size=y_shape).astype(np_type)
+    (of_out, of_x_diff, of_y_diff) = RunOneflowBinaryOp(
+        device_type, flow_op, x, y, data_type
+    )
+    (tf_out, tf_x_diff, tf_y_diff) = RunTensorFlowBinaryOp(tf_op, x, y)
+    test_case.assertTrue(
+        np.allclose(of_out, tf_out, rtol=out_rtol, atol=out_atol, equal_nan=True)
+    )
+    if compare_grad:
+        test_case.assertTrue(
+            np.allclose(
+                of_x_diff,
+                tf_x_diff.numpy(),
+                rtol=diff_rtol,
+                atol=diff_atol,
+                equal_nan=True,
+            )
+        )
+        test_case.assertTrue(
+            np.allclose(
+                of_y_diff,
+                tf_y_diff.numpy(),
+                rtol=diff_rtol,
+                atol=diff_atol,
+                equal_nan=True,
+            )
+        )
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBinaryElementwiseOps(flow.unittest.TestCase):
+    def test_floordiv(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.floordiv]
+        arg_dict["tf_op"] = [tf.math.floordiv]
+        arg_dict["x_shape"] = [(5, 5)]
+        arg_dict["y_shape"] = [(5, 5)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["x_minval"] = [-10]
+        arg_dict["x_maxval"] = [10]
+        arg_dict["y_minval"] = [1]
+        arg_dict["y_maxval"] = [10]
+        arg_dict["compare_grad"] = [False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pow(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.pow]
+        arg_dict["tf_op"] = [tf.math.pow]
+        arg_dict["x_shape"] = [(5, 5)]
+        arg_dict["y_shape"] = [(5, 5)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["x_minval"] = [1]
+        arg_dict["x_maxval"] = [5]
+        arg_dict["y_minval"] = [1]
+        arg_dict["y_maxval"] = [5]
+        arg_dict["compare_grad"] = [True]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_xdivy(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.xdivy]
+        arg_dict["tf_op"] = [tf.math.xdivy]
+        arg_dict["x_shape"] = [(5, 5)]
+        arg_dict["y_shape"] = [(5, 5)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["x_minval"] = [1]
+        arg_dict["x_maxval"] = [100]
+        arg_dict["y_minval"] = [1]
+        arg_dict["y_maxval"] = [10]
+        arg_dict["compare_grad"] = [True]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_xlogy(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.xlogy]
+        arg_dict["tf_op"] = [tf.math.xlogy]
+        arg_dict["x_shape"] = [(5, 5)]
+        arg_dict["y_shape"] = [(5, 5)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["x_minval"] = [1]
+        arg_dict["x_maxval"] = [5]
+        arg_dict["y_minval"] = [1]
+        arg_dict["y_maxval"] = [5]
+        arg_dict["compare_grad"] = [True]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_atan2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.atan2]
+        arg_dict["tf_op"] = [tf.math.atan2]
+        arg_dict["x_shape"] = [(5, 5)]
+        arg_dict["y_shape"] = [(5, 5)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["x_minval"] = [1]
+        arg_dict["x_maxval"] = [5]
+        arg_dict["y_minval"] = [1]
+        arg_dict["y_maxval"] = [5]
+        arg_dict["compare_grad"] = [True]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_boxing_v2.py b/python/oneflow/compatible/single_client/test/ops/test_boxing_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef8ccad96e01b72746ebebeb85779643c441994d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_boxing_v2.py
@@ -0,0 +1,349 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_split_to_split(
+    test_case, src_device_type, dst_device_type, src_axis, dst_axis
+):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def build_s2s(input_blob, src_device_num, dst_device_num):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src = flow.identity(
+                input_blob.with_distribute(flow.distribute.split(src_axis))
+            )
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
+        return dst
+
+    @flow.global_function(function_config=func_config)
+    def split_to_split_job(input_blob: oft.Numpy.Placeholder((96, 96))):
+        result_list = []
+        for i in (1, 2, 3):
+            for j in (1, 2, 3):
+                result_list.append(build_s2s(input_blob, i, j))
+        return tuple(result_list)
+
+    x = np.random.rand(96, 96).astype(np.float32)
+    result_tuple = split_to_split_job(x).get()
+    for out in result_tuple:
+        test_case.assertTrue(np.array_equal(x, out.numpy()))
+
+
+def _test_split_to_split_enable_all_to_all(
+    test_case, src_device_type, dst_device_type, src_device_num, dst_device_num
+):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    flow.config.collective_boxing.nccl_enable_all_to_all(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def build_s2s_all2all(input_blob, src_axis, dst_axis):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src = flow.identity(
+                input_blob.with_distribute(flow.distribute.split(src_axis))
+            )
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
+        return dst
+
+    @flow.global_function(function_config=func_config)
+    def split_to_split_all2all_job(input_blob: oft.Numpy.Placeholder((32, 16, 64, 48))):
+        result_list = []
+        for i in (0, 1, 2, 3):
+            for j in (0, 1, 2, 3):
+                if i == j:
+                    continue
+                result_list.append(build_s2s_all2all(input_blob, i, j))
+        return tuple(result_list)
+
+    x = np.random.rand(32, 16, 64, 48).astype(np.float32)
+    result_tuple = split_to_split_all2all_job(x).get()
+    for out in result_tuple:
+        test_case.assertTrue(np.array_equal(x, out.numpy()))
+
+
+def _test_split_to_broadcast(test_case, src_device_type, dst_device_type, src_axis):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def build_s2b(input_blob, src_device_num, dst_device_num):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src = flow.identity(
+                input_blob.with_distribute(flow.distribute.split(src_axis))
+            )
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
+        return dst
+
+    @flow.global_function(function_config=func_config)
+    def split_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96))):
+        result_list = []
+        for i in (1, 2, 3):
+            for j in (1, 2, 3):
+                result_list.append(build_s2b(input_blob, i, j))
+        return tuple(result_list)
+
+    x = np.random.rand(96, 96).astype(np.float32)
+    result_tuple = split_to_broadcast_job(x).get()
+    for out in result_tuple:
+        test_case.assertTrue(np.array_equal(x, out.numpy()))
+
+
+def _test_broadcast_to_split(test_case, src_device_type, dst_device_type, dst_axis):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def build_b2s(input_blob, src_device_num, dst_device_num):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src = flow.identity(input_blob.with_distribute(flow.distribute.broadcast()))
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
+        return dst
+
+    @flow.global_function(function_config=func_config)
+    def broadcast_to_split_job(input_blob: oft.Numpy.Placeholder((96, 96))):
+        result_list = []
+        for i in (1, 2, 3):
+            for j in (1, 2, 3):
+                result_list.append(build_b2s(input_blob, i, j))
+        return tuple(result_list)
+
+    x = np.random.rand(96, 96).astype(np.float32)
+    result_tuple = broadcast_to_split_job(x).get()
+    for out in result_tuple:
+        test_case.assertTrue(np.array_equal(x, out.numpy()))
+
+
+def _test_partial_sum_to_split(test_case, src_device_type, dst_device_type, dst_axis):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def build_p2s(input_blob, src_device_num, dst_device_num):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src = flow.identity(input_blob.with_distribute(flow.distribute.split(0)))
+            src = flow.math.reduce_sum(src, axis=0)
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
+        return dst
+
+    @flow.global_function(function_config=func_config)
+    def partial_sum_to_split_job(input_blob: oft.Numpy.Placeholder((96, 96, 96))):
+        result_list = []
+        for i in (2, 3):
+            for j in (1, 2, 3):
+                result_list.append(build_p2s(input_blob, i, j))
+        return tuple(result_list)
+
+    x = np.random.uniform(-1e-05, 1e-05, (96, 96, 96)).astype(np.float32)
+    result_tuple = partial_sum_to_split_job(x).get()
+    for out in result_tuple:
+        test_case.assertTrue(np.allclose(np.sum(x, axis=0), out.numpy()))
+
+
+def _test_partial_sum_to_broadcast(test_case, src_device_type, dst_device_type):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def build_p2b(input_blob, src_device_num, dst_device_num):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src = flow.identity(input_blob.with_distribute(flow.distribute.split(0)))
+            src = flow.math.reduce_sum(src, axis=0)
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
+        return dst
+
+    @flow.global_function(function_config=func_config)
+    def partial_sum_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96, 96))):
+        result_list = []
+        for i in (2, 3):
+            for j in (1, 2, 3):
+                result_list.append(build_p2b(input_blob, i, j))
+        return tuple(result_list)
+
+    x = np.random.uniform(-1e-05, 1e-05, (96, 96, 96)).astype(np.float32)
+    result_tuple = partial_sum_to_broadcast_job(x).get()
+    for out in result_tuple:
+        test_case.assertTrue(np.allclose(np.sum(x, axis=0), out.numpy()))
+
+
+def _test_broadcast_to_broadcast(test_case, src_device_type, dst_device_type):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def build_b2b(input_blob, src_device_num, dst_device_num):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src = flow.identity(input_blob.with_distribute(flow.distribute.broadcast()))
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
+        return dst
+
+    @flow.global_function(function_config=func_config)
+    def broadcast_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96))):
+        result_list = []
+        for i in (1, 2, 3):
+            for j in (1, 2, 3):
+                result_list.append(build_b2b(input_blob, i, j))
+        return tuple(result_list)
+
+    x = np.random.rand(96, 96).astype(np.float32)
+    result_tuple = broadcast_to_broadcast_job(x).get()
+    for out in result_tuple:
+        test_case.assertTrue(np.array_equal(x, out.numpy()))
+
+
+def _test_multi_lbi(
+    test_case, src_device_type, dst_device_type, src_device_num, dst_device_num
+):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def multi_lbi_job(x: oft.Numpy.Placeholder((96, 96, 96))):
+        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
+            src_s0 = flow.identity(x.with_distribute(flow.distribute.split(0)))
+            src_s1 = flow.identity(x.with_distribute(flow.distribute.split(1)))
+            src_b = flow.identity(x.with_distribute(flow.distribute.split(1)))
+            (t0_0, t0_1, t0_2) = flow.identity_n((src_s0, src_s1, src_b))
+        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
+            t0_0 = t0_0.with_distribute(flow.distribute.split(1))
+            t0_1 = t0_1.with_distribute(flow.distribute.broadcast())
+            t0_2 = t0_2.with_distribute(flow.distribute.split(1))
+            (t1_0, t1_1, t1_2) = flow.identity_n((t0_0, t0_1, t0_2))
+        return (t1_0, t1_1, t1_2)
+
+    x = np.random.uniform(-1e-05, 1e-05, (96, 96, 96)).astype(np.float32)
+    r0 = multi_lbi_job(x).get()[0].numpy()
+    r1 = multi_lbi_job(x).get()[1].numpy()
+    r2 = multi_lbi_job(x).get()[2].numpy()
+    test_case.assertTrue(np.array_equal(x, r0))
+    test_case.assertTrue(np.array_equal(x, r1))
+    test_case.assertTrue(np.array_equal(x, r2))
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestBoxingV2(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_split_to_split(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_device_type"] = ["cpu", "gpu"]
+        arg_dict["src_axis"] = [0, 1]
+        arg_dict["dst_axis"] = [0, 1]
+        for arg in GenArgList(arg_dict):
+            _test_split_to_split(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_split_to_split_all_to_all(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["gpu"]
+        arg_dict["dst_device_type"] = ["gpu"]
+        arg_dict["src_device_num"] = [4]
+        arg_dict["dst_device_num"] = [4]
+        for arg in GenArgList(arg_dict):
+            _test_split_to_split_enable_all_to_all(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_split_to_broadcast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_device_type"] = ["cpu", "gpu"]
+        arg_dict["src_axis"] = [0, 1]
+        for arg in GenArgList(arg_dict):
+            _test_split_to_broadcast(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_broadcast_to_split(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_axis"] = [0, 1]
+        for arg in GenArgList(arg_dict):
+            _test_broadcast_to_split(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_partial_sum_to_split(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_axis"] = [0, 1]
+        for arg in GenArgList(arg_dict):
+            _test_partial_sum_to_split(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_partial_sum_to_broadcast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_device_type"] = ["cpu", "gpu"]
+        for arg in GenArgList(arg_dict):
+            _test_partial_sum_to_broadcast(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_broadcast_to_broadcast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_device_type"] = ["cpu", "gpu"]
+        for arg in GenArgList(arg_dict):
+            _test_broadcast_to_broadcast(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_multi_lbi(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_device_type"] = ["cpu", "gpu"]
+        arg_dict["dst_device_type"] = ["cpu", "gpu"]
+        arg_dict["src_device_num"] = [1, 2, 3]
+        arg_dict["dst_device_num"] = [1, 2, 3]
+        for arg in GenArgList(arg_dict):
+            _test_multi_lbi(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_broadcast_like.py b/python/oneflow/compatible/single_client/test/ops/test_broadcast_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe46c90f30bd8a232623d6c34b16b9f5cbda3243
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_broadcast_like.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_broadcast_like_with_tf(
+    device_type, input_shape, like_shape, broadcast_axes, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def broadcast_like_forward(
+        x: tp.Numpy.Placeholder(shape=input_shape, dtype=flow.float),
+        y: tp.Numpy.Placeholder(shape=like_shape, dtype=flow.float),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.broadcast_like(x, y, broadcast_axes=broadcast_axes)
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    like = np.random.rand(*like_shape).astype(np.float32)
+    of_out = broadcast_like_forward(x, like).get()
+    np_out = np.broadcast_to(x, like_shape)
+    assert np.allclose(of_out.numpy(), np_out, rtol=rtol, atol=atol)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBroadcastLike(flow.unittest.TestCase):
+    def test_broadcast_like(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["input_shape"] = [(5, 2)]
+        arg_dict["like_shape"] = [(4, 5, 2)]
+        arg_dict["broadcast_axes"] = [[0]]
+        for arg in GenArgList(arg_dict):
+            compare_broadcast_like_with_tf(*arg)
+
+    def test_broadcast_like2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["input_shape"] = [(5, 2)]
+        arg_dict["like_shape"] = [(4, 6, 5, 2)]
+        arg_dict["broadcast_axes"] = [[0, 1]]
+        for arg in GenArgList(arg_dict):
+            compare_broadcast_like_with_tf(*arg)
+
+    def test_broadcast_like_grad(test_case):
+        def watch_diff_handler(blob: tp.Numpy):
+            test_case.assertTrue(np.array_equal(blob, [[3.0], [3.0], [3.0]]))
+
+        @flow.global_function(type="train")
+        def watch_matmul_diff_job(
+            images: tp.Numpy.Placeholder((3, 3), dtype=flow.float)
+        ) -> None:
+            weight_initializer = flow.constant_initializer(2)
+            weight_shape = (3, 1)
+            weight = flow.get_variable(
+                "three-weight", shape=weight_shape, initializer=weight_initializer
+            )
+            weight_broadcast = flow.broadcast_like(
+                weight, like=images, broadcast_axes=(1,)
+            )
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(weight_broadcast)
+            flow.watch_diff(weight, watch_diff_handler)
+
+        x = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]]).astype(np.float32)
+        watch_matmul_diff_job(x)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_broadcast_logical_ops.py b/python/oneflow/compatible/single_client/test/ops/test_broadcast_logical_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c678f0a1a6d3efdf86d58b7c4e9d8d4e29daec94
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_broadcast_logical_ops.py
@@ -0,0 +1,151 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def func_equal(a, b):
+    return a == b
+
+
+def func_not_equal(a, b):
+    return a != b
+
+
+def func_greater_than(a, b):
+    return a > b
+
+
+def func_greater_equal(a, b):
+    return a >= b
+
+
+def func_less_than(a, b):
+    return a < b
+
+
+def func_less_equal(a, b):
+    return a <= b
+
+
+def np_array(dtype, shape):
+    if dtype == flow.int8:
+        return np.random.randint(0, 127, shape).astype(np.int8)
+    elif dtype == flow.int32:
+        return np.random.randint(0, 10000, shape).astype(np.int32)
+    elif dtype == flow.int64:
+        return np.random.randint(0, 10000, shape).astype(np.int64)
+    elif dtype == flow.float:
+        return np.random.rand(*shape).astype(np.float32)
+    elif dtype == flow.double:
+        return np.random.rand(*shape).astype(np.double)
+    else:
+        assert False
+
+
+def GenerateTest(test_case, a_shape, b_shape, dtype=flow.int32, device_type="gpu"):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(dtype)
+
+    @flow.global_function(function_config=func_config)
+    def MyTestJob(
+        a: oft.Numpy.Placeholder(a_shape, dtype=dtype),
+        b: oft.Numpy.Placeholder(b_shape, dtype=dtype),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            equal_out = func_equal(a, b)
+            not_equal_out = func_not_equal(a, b)
+            greater_than_out = func_greater_than(a, b)
+            greater_equal_out = func_greater_equal(a, b)
+            less_than_out = func_less_than(a, b)
+            less_equal_out = func_less_equal(a, b)
+            return (
+                equal_out,
+                not_equal_out,
+                greater_than_out,
+                greater_equal_out,
+                less_than_out,
+                less_equal_out,
+            )
+
+    a = np_array(dtype, a_shape)
+    b = np_array(dtype, b_shape)
+    (
+        equal_out,
+        not_equal_out,
+        greater_than_out,
+        greater_equal_out,
+        less_than_out,
+        less_equal_out,
+    ) = MyTestJob(a, b).get()
+    test_case.assertTrue(np.array_equal(equal_out.numpy(), func_equal(a, b)))
+    test_case.assertTrue(np.array_equal(not_equal_out.numpy(), func_not_equal(a, b)))
+    test_case.assertTrue(
+        np.array_equal(greater_than_out.numpy(), func_greater_than(a, b))
+    )
+    test_case.assertTrue(
+        np.array_equal(greater_equal_out.numpy(), func_greater_equal(a, b))
+    )
+    test_case.assertTrue(np.array_equal(less_than_out.numpy(), func_less_than(a, b)))
+    test_case.assertTrue(np.array_equal(less_equal_out.numpy(), func_less_equal(a, b)))
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBroadcastLogicalOps(flow.unittest.TestCase):
+    def test_broadcast_logical_cpu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["a_shape"] = [(64, 64)]
+        arg_dict["b_shape"] = [(1, 64)]
+        arg_dict["data_type"] = [flow.int32, flow.float]
+        arg_dict["device_type"] = ["cpu"]
+        for arg in GenArgList(arg_dict):
+            if len(arg[1]) < len(arg[2]):
+                continue
+            GenerateTest(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_broadcast_logical_gpu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["a_shape"] = [(64, 64), (64, 64, 64)]
+        arg_dict["b_shape"] = [(1, 64), (1, 64, 1)]
+        arg_dict["data_type"] = [
+            flow.int8,
+            flow.int32,
+            flow.int64,
+            flow.float,
+            flow.double,
+        ]
+        arg_dict["device_type"] = ["gpu"]
+        for arg in GenArgList(arg_dict):
+            if len(arg[1]) < len(arg[2]):
+                continue
+            GenerateTest(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_broadcast_maximum.py b/python/oneflow/compatible/single_client/test/ops/test_broadcast_maximum.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c4024031d84464b8b9d3ccff31b934247e9b768
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_broadcast_maximum.py
@@ -0,0 +1,63 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def _check(test_case, a, b, out):
+    test_case.assertTrue(np.array_equal(np.maximum(a, b), out))
+
+
+def _run_test(test_case, a, b, dtype, device):
+    @flow.global_function(function_config=func_config)
+    def BroadcastMaximum(
+        a: oft.Numpy.Placeholder(a.shape, dtype=dtype),
+        b: oft.Numpy.Placeholder(b.shape, dtype=dtype),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            return flow.math.maximum(a, b)
+
+    out = BroadcastMaximum(a, b).get()
+    _check(test_case, a, b, out.numpy())
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBroadcastMaximum(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_broadcast_maximum_random_gpu(test_case):
+        a = np.random.rand(1024, 1024).astype(np.float32)
+        b = np.random.rand(1024, 1024).astype(np.float32)
+        _run_test(test_case, a, b, flow.float32, "gpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_broadcast_maximum_broadcast_gpu(test_case):
+        a = np.random.rand(1024, 1).astype(np.float32)
+        b = np.random.rand(1, 1024).astype(np.float32)
+        _run_test(test_case, a, b, flow.float32, "gpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_broadcast_minimum.py b/python/oneflow/compatible/single_client/test/ops/test_broadcast_minimum.py
new file mode 100644
index 0000000000000000000000000000000000000000..27d9a31c51d7089fc353d0e0365c72d9115e6034
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_broadcast_minimum.py
@@ -0,0 +1,63 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def _check(test_case, a, b, out):
+    test_case.assertTrue(np.array_equal(np.minimum(a, b), out))
+
+
+def _run_test(test_case, a, b, dtype, device):
+    @flow.global_function(function_config=func_config)
+    def BroadcastMinimum(
+        a: oft.Numpy.Placeholder(a.shape, dtype=dtype),
+        b: oft.Numpy.Placeholder(b.shape, dtype=dtype),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            return flow.math.minimum(a, b)
+
+    out = BroadcastMinimum(a, b).get()
+    _check(test_case, a, b, out.numpy())
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBroadcastMinimum(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_broadcast_minimum_random_gpu(test_case):
+        a = np.random.rand(1024, 1024).astype(np.float32)
+        b = np.random.rand(1024, 1024).astype(np.float32)
+        _run_test(test_case, a, b, flow.float32, "gpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_broadcast_minimum_broadcast_gpu(test_case):
+        a = np.random.rand(1024, 1).astype(np.float32)
+        b = np.random.rand(1, 1024).astype(np.float32)
+        _run_test(test_case, a, b, flow.float32, "gpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_broadcast_normal.py b/python/oneflow/compatible/single_client/test/ops/test_broadcast_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e3c71410121c4e8ba96a32a48dfbd0d42622d6b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_broadcast_normal.py
@@ -0,0 +1,256 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import (
+    GenArgDict,
+    GenArgList,
+    type_name_to_flow_type,
+    type_name_to_np_type,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def RunOneflowOp(device_type, flow_op, x, y, data_type):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    flow_type = type_name_to_flow_type[data_type]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowJob(
+        x: oft.Numpy.Placeholder(x.shape, dtype=flow_type),
+        y: oft.Numpy.Placeholder(y.shape, dtype=flow_type),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="x",
+                shape=x.shape,
+                dtype=flow_type,
+                initializer=flow.zeros_initializer(),
+                trainable=True,
+            )
+            y += flow.get_variable(
+                name="y",
+                shape=y.shape,
+                dtype=flow_type,
+                initializer=flow.zeros_initializer(),
+                trainable=True,
+            )
+            loss = flow_op(x, y)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
+            return loss
+
+    out = FlowJob(x, y).get().numpy()
+    x_diff = test_global_storage.Get("x_diff")
+    y_diff = test_global_storage.Get("y_diff")
+    return (out, x_diff, y_diff)
+
+
+def RunTensorFlowOp(tf_op, x, y):
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(x)
+        y = tf.Variable(y)
+        out = tf_op(x, y)
+    x_diff = tape.gradient(out, x)
+    y_diff = tape.gradient(out, y)
+    return (out.numpy(), x_diff.numpy(), y_diff.numpy())
+
+
+def compare_with_tensorflow_grad(
+    device_type,
+    flow_op,
+    tf_op,
+    x_shape,
+    y_shape,
+    data_type,
+    input_minval=-10,
+    input_maxval=10,
+    out_rtol=1e-05,
+    out_atol=1e-05,
+    diff_rtol=0.0001,
+    diff_atol=0.001,
+):
+    assert device_type in ["gpu", "cpu"]
+    np_type = type_name_to_np_type[data_type]
+    x = np.random.uniform(low=input_minval, high=input_maxval, size=x_shape).astype(
+        np_type
+    )
+    y = np.random.uniform(low=input_minval, high=input_maxval, size=y_shape).astype(
+        np_type
+    )
+    if flow_op in (flow.math.divide, flow.math.mod):
+        y[np.where(y == 0)] += 1
+    (of_out, of_x_diff, of_y_diff) = RunOneflowOp(device_type, flow_op, x, y, data_type)
+    (tf_out, tf_x_diff, tf_y_diff) = RunTensorFlowOp(tf_op, x, y)
+    assert np.allclose(of_out, tf_out, rtol=out_rtol, atol=out_atol, equal_nan=True)
+    assert np.allclose(
+        of_x_diff, tf_x_diff, rtol=diff_rtol, atol=diff_atol, equal_nan=True
+    )
+    assert np.allclose(
+        of_y_diff, tf_y_diff, rtol=diff_rtol, atol=diff_atol, equal_nan=True
+    )
+    flow.clear_default_session()
+
+
+def compare_with_tensorflow(
+    device_type,
+    flow_op,
+    tf_op,
+    x_shape,
+    y_shape,
+    data_type,
+    input_minval=-10,
+    input_maxval=10,
+    out_rtol=1e-05,
+    out_atol=1e-05,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    flow_type = type_name_to_flow_type[data_type]
+
+    @flow.global_function(function_config=func_config)
+    def FlowJob(
+        x: oft.Numpy.Placeholder(x_shape, dtype=flow_type),
+        y: oft.Numpy.Placeholder(y_shape, dtype=flow_type),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow_op(x, y)
+
+    np_type = type_name_to_np_type[data_type]
+    if np_type in (np.int8, np.int32, np.int64):
+        x = np.random.randint(low=input_minval, high=input_maxval, size=x_shape).astype(
+            np_type
+        )
+        y = np.random.randint(low=input_minval, high=input_maxval, size=y_shape).astype(
+            np_type
+        )
+    else:
+        x = np.random.uniform(low=input_minval, high=input_maxval, size=x_shape).astype(
+            np_type
+        )
+        y = np.random.uniform(low=input_minval, high=input_maxval, size=y_shape).astype(
+            np_type
+        )
+    if flow_op in (flow.math.divide, flow.math.mod):
+        y[np.where(y == 0)] += 1
+    of_out = FlowJob(x, y).get().numpy()
+    tf_out = tf_op(x, y).numpy()
+    assert np.allclose(of_out, tf_out, rtol=out_rtol, atol=out_atol, equal_nan=True)
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBroadcastNormal(flow.unittest.TestCase):
+    def test_broadcast_add(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["flow_op"] = [flow.math.add]
+        arg_dict["tf_op"] = [tf.math.add]
+        arg_dict["x_shape"] = [(3, 1, 4, 1)]
+        arg_dict["y_shape"] = [(4, 1, 6)]
+        arg_dict["data_type"] = ["float32", "double"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow_grad(*arg)
+
+    def test_broadcast_sub(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["flow_op"] = [flow.math.subtract]
+        arg_dict["tf_op"] = [tf.math.subtract]
+        arg_dict["x_shape"] = [(3, 1, 4, 1)]
+        arg_dict["y_shape"] = [(4, 1, 6)]
+        arg_dict["data_type"] = ["float32", "double"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_broadcast_mul(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.multiply]
+        arg_dict["tf_op"] = [tf.math.multiply]
+        arg_dict["x_shape"] = [(3, 1, 4, 5, 1)]
+        arg_dict["y_shape"] = [(1, 4, 1, 1, 5)]
+        arg_dict["data_type"] = ["float32", "double"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow_grad(*arg)
+
+    def test_broadcast_div(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["flow_op"] = [flow.math.divide]
+        arg_dict["tf_op"] = [tf.math.divide]
+        arg_dict["x_shape"] = [(3, 1, 4, 5, 1)]
+        arg_dict["y_shape"] = [(3, 4, 1, 1, 5)]
+        arg_dict["data_type"] = ["float32", "double"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow_grad(*arg)
+
+    def test_broadcast_floormod(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["flow_op"] = [flow.math.mod]
+        arg_dict["tf_op"] = [tf.math.floormod]
+        arg_dict["x_shape"] = [(3, 1, 4, 5, 1)]
+        arg_dict["y_shape"] = [(1, 4, 1, 1, 5)]
+        arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_broadcast_maximum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["flow_op"] = [flow.math.maximum]
+        arg_dict["tf_op"] = [tf.math.maximum]
+        arg_dict["x_shape"] = [(3, 1, 4, 5, 1)]
+        arg_dict["y_shape"] = [(1, 4, 1, 1, 5)]
+        arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_broadcast_minimum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["flow_op"] = [flow.math.minimum]
+        arg_dict["tf_op"] = [tf.math.minimum]
+        arg_dict["x_shape"] = [(3, 1, 4, 5, 1)]
+        arg_dict["y_shape"] = [(1, 4, 1, 1, 5)]
+        arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_broadcast_to_compatible_with.py b/python/oneflow/compatible/single_client/test/ops/test_broadcast_to_compatible_with.py
new file mode 100644
index 0000000000000000000000000000000000000000..18e07e149c1a80f4481540c64312e41965cb7326
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_broadcast_to_compatible_with.py
@@ -0,0 +1,189 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _of_broadcast_to_compatible_with(x, compatible_shape, x_shape=None):
+    assert isinstance(compatible_shape, (list, tuple))
+    if x_shape is None:
+        x_shape = x.shape
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def broadcast_to_compatible_with_fn(
+        x_def: oft.ListNumpy.Placeholder(shape=x_shape, dtype=flow.float)
+    ):
+        compatible_var = [
+            flow.get_variable(
+                "compatible_var_{}".format(i),
+                shape=cp_shape,
+                dtype=flow.float,
+                initializer=flow.random_normal_initializer(),
+                trainable=False,
+            )
+            for (i, cp_shape) in enumerate(compatible_shape)
+        ]
+        return flow.broadcast_to_compatible_with(x_def, compatible_var)
+
+    return broadcast_to_compatible_with_fn([x]).get().numpy_list()[0]
+
+
+def _of_broadcast_to_compatible_with_dynamic(
+    x, a, b, x_shape=None, a_shape=None, b_shape=None
+):
+    if x_shape is None:
+        x_shape = x.shape
+    if a_shape is None:
+        a_shape = a.shape
+    if b_shape is None:
+        b_shape = b.shape
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def broadcast_to_compatible_with_fn(
+        x_def: oft.ListNumpy.Placeholder(x_shape, dtype=flow.float),
+        a_def: oft.ListNumpy.Placeholder(a_shape, dtype=flow.float),
+        b_def: oft.ListNumpy.Placeholder(b_shape, dtype=flow.float),
+    ):
+        return flow.broadcast_to_compatible_with(
+            x_def, [flow.identity(a_def), flow.identity(b_def)]
+        )
+
+    return broadcast_to_compatible_with_fn([x], [a], [b]).get().numpy_list()[0]
+
+
+def _of_broadcast_to_compatible_with_grad(x, compatible_shape, dx_watcher):
+    assert isinstance(compatible_shape, (list, tuple))
+    assert callable(dx_watcher)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(type="train", function_config=func_config)
+    def broadcast_to_compatible_with_fn(
+        x_def: oft.Numpy.Placeholder(x.shape, dtype=flow.float)
+    ):
+        x_var = flow.get_variable(
+            "x_var",
+            shape=x.shape,
+            dtype=flow.float,
+            initializer=flow.constant_initializer(0),
+            trainable=True,
+        )
+        compatible_var = [
+            flow.get_variable(
+                "compatible_var_{}".format(i),
+                shape=cp_shape,
+                dtype=flow.float,
+                initializer=flow.random_normal_initializer(),
+                trainable=False,
+            )
+            for (i, cp_shape) in enumerate(compatible_shape)
+        ]
+        x_var = x_var + x_def
+        y = flow.broadcast_to_compatible_with(x_var, compatible_var)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(y)
+        flow.watch_diff(x_var, dx_watcher)
+        return y
+
+    return broadcast_to_compatible_with_fn(x).get().numpy()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBroadcastToCompatibleWith(flow.unittest.TestCase):
+    def test_broadcast_to_compatible_with(test_case):
+        x = np.random.standard_normal((5, 2)).astype(np.float32)
+        compatible_shape = [[4, 5, 2], [4, 5, 1]]
+        ret = _of_broadcast_to_compatible_with(x, compatible_shape)
+        expected_ret = np.broadcast_to(x, [4, 5, 2])
+        test_case.assertTrue(np.array_equal(expected_ret, ret))
+
+    def test_dynamic_broadcast_to_compatible_with(test_case):
+        x = np.random.standard_normal((10, 6)).astype(np.float32)
+        x_static_shape = (15, 6)
+        a = np.random.standard_normal((3, 10, 6)).astype(np.float32)
+        a_static_shape = (3, 15, 6)
+        b = np.random.standard_normal((3, 10, 1)).astype(np.float32)
+        b_static_shape = (3, 15, 1)
+        ret = _of_broadcast_to_compatible_with_dynamic(
+            x, a, b, x_static_shape, a_static_shape, b_static_shape
+        )
+        expected_ret = np.broadcast_to(x, [3, 10, 6])
+        test_case.assertTrue(np.array_equal(expected_ret, ret))
+
+    def test_dynamic_broadcast_to_compatible_with_case_2(test_case):
+        x = np.random.standard_normal((20, 1, 1)).astype(np.float32)
+        x_static_shape = (23, 1, 1)
+        a = np.random.standard_normal((11, 1)).astype(np.float32)
+        a_static_shape = (15, 1)
+        b = np.random.standard_normal((7,)).astype(np.float32)
+        b_static_shape = (8,)
+        ret = _of_broadcast_to_compatible_with_dynamic(
+            x, a, b, x_static_shape, a_static_shape, b_static_shape
+        )
+        expected_ret = np.broadcast_to(x, [20, 11, 7])
+        test_case.assertTrue(np.array_equal(expected_ret, ret))
+
+    def test_broadcast_to_compatible_with_grad(test_case):
+        x = np.random.standard_normal((7, 1, 4)).astype(np.float32)
+        compatible_shape = [[7, 1, 4], [5, 4]]
+
+        def compare_dy(dx_blob):
+            dx = np.ones([7, 5, 4], dtype=np.float32).sum(axis=1).reshape(x.shape)
+            test_case.assertTrue(np.array_equal(dx, dx_blob.numpy()))
+
+        ret = _of_broadcast_to_compatible_with_grad(x, compatible_shape, compare_dy)
+        exp_ret = np.broadcast_to(x, [7, 5, 4])
+        test_case.assertTrue(np.array_equal(exp_ret, ret))
+
+    def test_broadcast_to_compatible_with_grad_case_2(test_case):
+        x = np.random.standard_normal((7, 1, 4)).astype(np.float32)
+        compatible_shape = [[1, 7, 5, 4]]
+
+        def compare_dy(dx_blob):
+            dx = np.ones([7, 5, 4], dtype=np.float32).sum(axis=1).reshape(x.shape)
+            test_case.assertTrue(np.array_equal(dx, dx_blob.numpy()))
+
+        ret = _of_broadcast_to_compatible_with_grad(x, compatible_shape, compare_dy)
+        exp_ret = np.broadcast_to(x, [1, 7, 5, 4])
+        test_case.assertTrue(np.array_equal(exp_ret, ret))
+
+    def test_broadcast_to_compatible_with_no_broadcast(test_case):
+        x = np.random.standard_normal((9, 9, 6)).astype(np.float32)
+        x_static_shape = (10, 9, 6)
+        compatible_shape = [[6], [9, 1]]
+        ret = _of_broadcast_to_compatible_with(x, compatible_shape, x_static_shape)
+        test_case.assertTrue(np.array_equal(x, ret))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_cast.py b/python/oneflow/compatible/single_client/test/ops/test_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eee62c16332634eb1131eac3a5b4d5a340154e3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_cast.py
@@ -0,0 +1,115 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def cast_forward_compare_with_tensorflow(test_cast, device_type, input_shape, dtype):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def cast_forward(
+        input_def: oft.Numpy.Placeholder(
+            shape=input_shape, dtype=type_name_to_flow_type[dtype]
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.cast(input_def, dtype=type_name_to_flow_type[dtype])
+
+    input = np.random.rand(*input_shape).astype(type_name_to_np_type[dtype])
+    of_out = cast_forward(input).get()
+    tf_out = tf.cast(input, dtype=type_name_to_np_type[dtype])
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+
+
+def compare_with_tensorflow(device_type, input_shape, dtype):
+    assert device_type in ["gpu", "cpu"]
+    assert dtype in ["float32", "double"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def CastJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "in",
+                shape=input_shape,
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.random_uniform_initializer(),
+                trainable=True,
+            )
+            loss = flow.cast(x, dtype=flow.float)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = CastJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.cast(x, dtype=type_name_to_np_type[dtype])
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCast(flow.unittest.TestCase):
+    def test_cast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(5, 4, 3)]
+        arg_dict["dtype"] = ["float32", "double"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_cast_forward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(5, 4, 3)]
+        arg_dict["dtype"] = ["float32", "int8", "uint8", "double", "int32", "int64"]
+        for arg in GenArgList(arg_dict):
+            cast_forward_compare_with_tensorflow(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_cast_to_static_shape.py b/python/oneflow/compatible/single_client/test/ops/test_cast_to_static_shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..436f70f6e434fb2df2d5e970e579d5df1792b87b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_cast_to_static_shape.py
@@ -0,0 +1,131 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgDict, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def _make_cast_to_static_shape_fn(
+    test_case, shape, data_type, device_type, device_num, compare_diff_fn
+):
+    dtype = type_name_to_flow_type[data_type]
+    require_grad = dtype is flow.float32
+    flow.clear_default_session()
+    if device_type == "gpu":
+        flow.config.gpu_device_num(device_num)
+    elif device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        raise ValueError
+    assert device_num > 0
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(dtype)
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(
+        type="train" if require_grad else "predict", function_config=func_config
+    )
+    def cast_to_static_shape_fn(
+        x: flow.typing.ListNumpy.Placeholder(shape=shape, dtype=dtype)
+    ) -> flow.typing.ListNumpy:
+        x_var = flow.get_variable(
+            name="x_var",
+            shape=(1,),
+            dtype=flow.float32,
+            initializer=flow.zeros_initializer(),
+        )
+        x = x + flow.cast(x_var, dtype=dtype)
+        y = flow.cast_to_static_shape(x)
+        test_case.assertFalse(y.is_dynamic)
+        if require_grad:
+            flow.watch_diff(x, compare_diff_fn)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(y)
+        return y
+
+    return cast_to_static_shape_fn
+
+
+def _random_input(shape, data_type):
+    dtype = type_name_to_np_type[data_type]
+    if data_type == "float32" or data_type == "double":
+        return np.random.random_sample(shape).astype(dtype)
+    elif data_type == "int32":
+        return np.random.randint(low=0, high=100, size=shape).astype(dtype)
+    else:
+        raise NotImplementedError
+
+
+def _check_cast_to_static_shape(test_case, shape, data_type, device_type, device_num):
+    x = _random_input(shape, data_type)
+
+    def comp(x, y):
+        test_case.assertTrue(np.array_equal(x, y))
+
+    def comp_diff(diff):
+        dx = np.ones(shape)
+        for d in diff.numpy_list():
+            test_case.assertTrue(np.array_equal(d, dx))
+
+    cast_to_static_shape_fn = _make_cast_to_static_shape_fn(
+        test_case, shape, data_type, device_type, device_num, comp_diff
+    )
+    y = cast_to_static_shape_fn([x] * device_num)
+    if isinstance(y, list):
+        for y_ in y:
+            comp(x, y_)
+    elif isinstance(y, np.ndarray):
+        comp(x, y)
+    else:
+        raise ValueError
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCastToStaticShape(flow.unittest.TestCase):
+    def test_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(5, 4, 3), (10, 7)]
+        arg_dict["data_type"] = ["float32", "double", "int32"]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["device_num"] = [1]
+        for arg in GenArgDict(arg_dict):
+            _check_cast_to_static_shape(test_case, **arg)
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestCastToStaticShapeParallel(flow.unittest.TestCase):
+    def test_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(10,)]
+        arg_dict["data_type"] = ["float32", "double", "int32"]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["device_num"] = [4]
+        for arg in GenArgDict(arg_dict):
+            _check_cast_to_static_shape(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_categorical_ordinal_encoder.py b/python/oneflow/compatible/single_client/test/ops/test_categorical_ordinal_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccfeef72b1e64b1539df5e3eab909fbe1da46cc4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_categorical_ordinal_encoder.py
@@ -0,0 +1,125 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import sys
+import typing
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_categorical_ordinal_encoder(
+    test_case, device_tag, dtype, size, capacity, num_tokens, num_iters
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def test_job(
+        x: oft.Numpy.Placeholder(shape=(size,), dtype=dtype)
+    ) -> typing.Tuple[oft.Numpy, oft.Numpy]:
+        with flow.scope.placement(device_tag, "0:0"):
+            y = flow.layers.categorical_ordinal_encoder(x, capacity=capacity)
+            z = flow.layers.categorical_ordinal_encoder(
+                x, capacity=capacity, name="encode1"
+            )
+            return (y, z)
+
+    tokens = np.random.randint(-sys.maxsize, sys.maxsize, size=[num_tokens]).astype(
+        flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+    )
+    k_set = set()
+    v_set = set()
+    kv_set = set()
+    vk_set = set()
+    for i in range(num_iters):
+        x = tokens[np.random.randint(0, num_tokens, (size,))]
+        (y, z) = test_job(x)
+        test_case.assertEqual(x.shape, y.shape)
+        if device_tag == "cpu":
+            test_case.assertTrue(
+                np.array_equal(y, z),
+                "\ny: {}\n{}\nz: {}\n{}".format(y.shape, y, z.shape, z),
+            )
+        for (k, v) in zip(x, y):
+            k_set.add(k)
+            v_set.add(v)
+            kv_set.add((k, v))
+            vk_set.add((v, k))
+    unique_size = len(k_set)
+    test_case.assertEqual(len(v_set), unique_size)
+    test_case.assertEqual(len(kv_set), unique_size)
+    test_case.assertEqual(len(vk_set), unique_size)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCategoricalOrdinalEncoder(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_categorical_ordinal_encoder_gpu_large(test_case):
+        _test_categorical_ordinal_encoder(
+            test_case=test_case,
+            device_tag="gpu",
+            dtype=flow.int64,
+            size=10000,
+            capacity=320000,
+            num_tokens=200000,
+            num_iters=256,
+        )
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_categorical_ordinal_encoder_gpu_small(test_case):
+        _test_categorical_ordinal_encoder(
+            test_case=test_case,
+            device_tag="gpu",
+            dtype=flow.int32,
+            size=10,
+            capacity=250,
+            num_tokens=200,
+            num_iters=4,
+        )
+
+    def test_categorical_ordinal_encoder_cpu_large(test_case):
+        _test_categorical_ordinal_encoder(
+            test_case=test_case,
+            device_tag="cpu",
+            dtype=flow.int64,
+            size=20000,
+            capacity=220000,
+            num_tokens=200000,
+            num_iters=100,
+        )
+
+    def test_categorical_ordinal_encoder_cpu_very_large(test_case):
+        _test_categorical_ordinal_encoder(
+            test_case=test_case,
+            device_tag="cpu",
+            dtype=flow.int64,
+            size=50000,
+            capacity=1000000,
+            num_tokens=500000,
+            num_iters=100,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_ccrelu.py b/python/oneflow/compatible/single_client/test/ops/test_ccrelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ee8f22a53231703fc31cd64b371373039cfceca
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_ccrelu.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def ccrelu(x, name):
+    return (
+        flow.user_op_builder(name)
+        .Op("ccrelu")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def fixed_tensor_def_test(test_case, func_config):
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ReluJob(a: oft.Numpy.Placeholder((5, 2))):
+        return ccrelu(a, "my_cc_relu_op")
+
+    x = np.random.rand(5, 2).astype(np.float32)
+    y = ReluJob(x).get().numpy()
+    test_case.assertTrue(np.array_equal(y, np.maximum(x, 0)))
+
+
+def mirrored_tensor_def_test(test_case, func_config):
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ReluJob(a: oft.ListNumpy.Placeholder((5, 2))):
+        return ccrelu(a, "my_cc_relu_op")
+
+    x = np.random.rand(3, 1).astype(np.float32)
+    y = ReluJob([x]).get().numpy_list()[0]
+    test_case.assertTrue(np.array_equal(y, np.maximum(x, 0)))
+
+
+class TestCcrelu(flow.unittest.TestCase):
+    @flow.unittest.skip_unless_1n1d()
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_ccrelu(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        fixed_tensor_def_test(test_case, func_config)
+
+    @flow.unittest.skip_unless_1n1d()
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_mirror_ccrelu(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        mirrored_tensor_def_test(test_case, func_config)
+
+    @flow.unittest.skip_unless_2n1d()
+    def test_ccrelu_2n1c_0(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        fixed_tensor_def_test(test_case, func_config)
+
+    @flow.unittest.skip_unless_2n1d()
+    def test_ccrelu_2n1c_1(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        fixed_tensor_def_test(test_case, func_config)
+
+    @flow.unittest.skip_unless_2n1d()
+    def test_ccrelu_2n1c_2(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        fixed_tensor_def_test(test_case, func_config)
+
+    @flow.unittest.skip_unless_2n1d()
+    def test_ccrelu_2n1c_3(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        fixed_tensor_def_test(test_case, func_config)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_checkpoint.py b/python/oneflow/compatible/single_client/test/ops/test_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6f1d2ec45ef2b23cb4863d0f4da18cdb79db1ff
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_checkpoint.py
@@ -0,0 +1,397 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def refresh_session():
+    flow.clear_default_session()
+    flow.config.gpu_device_num(flow.unittest.env.device_num())
+
+
+def get_placement():
+    node_size = flow.unittest.env.node_size()
+    device_ids = "0-{}".format(flow.unittest.env.device_num() - 1)
+    machine_device_ids = [
+        "{}:{}".format(node_id, device_ids) for node_id in range(node_size)
+    ]
+    return flow.scope.placement("gpu", machine_device_ids)
+
+
+def get_simple_momentum_training_model(dtype):
+    assert dtype == flow.float32
+
+    @flow.global_function(type="train")
+    def model() -> tp.Numpy:
+        with get_placement():
+            x = flow.get_variable(
+                name="x",
+                shape=(4, 5),
+                dtype=flow.float32,
+                initializer=flow.random_normal_initializer(mean=10, stddev=1),
+            )
+            w = flow.get_variable(
+                name="w",
+                shape=(5, 6),
+                dtype=flow.float32,
+                initializer=flow.random_normal_initializer(mean=10, stddev=1),
+                distribute=flow.distribute.split(0),
+            )
+            y = flow.matmul(x, w)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.01]), momentum=0.9
+            ).minimize(y)
+            return y
+
+    return model
+
+
+def get_simple_model(dtype):
+    @flow.global_function()
+    def add() -> tp.Numpy:
+        with get_placement():
+            x = flow.get_variable(
+                name="x",
+                shape=(9, 3),
+                dtype=dtype,
+                initializer=flow.random_normal_initializer(mean=10, stddev=1),
+                distribute=flow.distribute.split(0),
+            )
+            y = flow.get_variable(
+                name="y",
+                shape=(9, 3),
+                dtype=dtype,
+                initializer=flow.constant_initializer(5, dtype=dtype),
+            )
+            z = flow.get_variable(
+                name="z",
+                shape=(9, 3),
+                dtype=dtype,
+                initializer=flow.random_normal_initializer(),
+            )
+            return flow.math.add_n([x, y, z])
+
+    return add
+
+
+def get_large_model(dtype):
+    @flow.global_function()
+    def large() -> tp.Numpy:
+        with get_placement():
+            x = flow.get_variable(
+                name="x",
+                shape=(10, 2801, 820, 4),
+                dtype=dtype,
+                initializer=flow.random_normal_initializer(mean=10, stddev=1),
+                distribute=flow.distribute.split(0),
+            )
+            return flow.math.reduce_mean(x)
+
+    return large
+
+
+def get_add_and_reduce_mean_model(dtype):
+    @flow.global_function()
+    def model() -> tp.Numpy:
+        with get_placement():
+            x = flow.get_variable(
+                name="x",
+                shape=(10, 801, 820, 4),
+                dtype=dtype,
+                initializer=flow.random_normal_initializer(mean=10, stddev=1),
+                distribute=flow.distribute.split(0),
+            )
+            y = flow.get_variable(
+                name="y",
+                shape=(10, 801, 820, 4),
+                dtype=dtype,
+                initializer=flow.random_normal_initializer(mean=10, stddev=1),
+                distribute=flow.distribute.split(0),
+            )
+            return flow.math.reduce_mean(x + y)
+
+    return model
+
+
+def get_checkpoint_ready_model(model_getter, dtype):
+    model = model_getter(dtype)
+    if flow.eager_execution_enabled():
+        model()
+    return model
+
+
+def _TestSaveCorrectness(test_case, model_getter, dtype, legacy_api):
+    """
+    Save weights by new model io, load weights by legacy model io,
+    and check the equality.
+    """
+    with tempfile.TemporaryDirectory() as save_dir:
+        refresh_session()
+        flow.config.enable_legacy_model_io(False)
+        large1 = get_checkpoint_ready_model(model_getter, dtype)
+        if legacy_api:
+            check_point = flow.train.CheckPoint()
+            check_point.save(save_dir)
+        else:
+            flow.checkpoint.save(save_dir)
+        res1 = large1()
+        refresh_session()
+        flow.config.enable_legacy_model_io(True)
+        large2 = get_checkpoint_ready_model(model_getter, dtype)
+        check_point = flow.train.CheckPoint()
+        check_point.load(save_dir)
+        flow.sync_default_session()
+        res2 = large2()
+        test_case.assertTrue(np.array_equal(res1, res2))
+
+
+def _TestRoundTrip(test_case, model_getter, dtype):
+    """
+    Save weights by new model io, load weights by new model io,
+    and check the equality.
+    """
+    with tempfile.TemporaryDirectory() as save_dir:
+        refresh_session()
+        large1 = get_checkpoint_ready_model(model_getter, dtype)
+        flow.checkpoint.save(save_dir)
+        res1 = large1()
+        refresh_session()
+        large2 = get_checkpoint_ready_model(model_getter, dtype)
+        vars_in_file = flow.checkpoint.get(save_dir)
+        flow.load_variables(vars_in_file)
+        res2 = large2()
+        test_case.assertTrue(np.array_equal(res1, res2))
+
+
+def _TestLoadCorrectness(test_case, model_getter, dtype, legacy_api):
+    """
+    Save weights by legacy model io, load weights by new model io,
+    and check the equality.
+    """
+    with tempfile.TemporaryDirectory() as save_dir:
+        refresh_session()
+        flow.config.enable_legacy_model_io(True)
+        large1 = get_checkpoint_ready_model(model_getter, dtype)
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        check_point.save(save_dir)
+        res1 = large1()
+        flow.clear_default_session()
+        flow.config.gpu_device_num(4)
+        flow.config.enable_legacy_model_io(False)
+        large2 = get_checkpoint_ready_model(model_getter, dtype)
+        if legacy_api:
+            check_point = flow.train.CheckPoint()
+            check_point.load(save_dir)
+        else:
+            vars_in_file = flow.checkpoint.get(save_dir)
+            flow.load_variables(vars_in_file)
+        res2 = large2()
+        test_case.assertTrue(np.array_equal(res1, res2))
+
+
+def _TestPartiallyLoadNumpy(test_case, dtype):
+    refresh_session()
+    model = get_checkpoint_ready_model(get_add_and_reduce_mean_model, dtype)
+    var_x = flow.get_all_variables()["x"]
+    var_y_value_before_loading = flow.get_all_variables()["y"].numpy()
+    new_val_np = np.random.random(var_x.shape).astype(np.float32)
+    flow.load_variables({"x": new_val_np})
+    var_y_value_after_loading = flow.get_all_variables()["y"].numpy()
+    flow_res = model()
+    np_res = (var_y_value_after_loading + new_val_np).mean()
+    test_case.assertTrue(
+        np.allclose(flow_res, np_res),
+        {"flow_res": flow_res, "np_res": np_res, "diff": flow_res - np_res},
+    )
+    test_case.assertTrue(
+        np.array_equal(var_y_value_before_loading, var_y_value_after_loading)
+    )
+
+
+def _TestMixedModel(test_case, dtype):
+    with tempfile.TemporaryDirectory() as save_dir1, tempfile.TemporaryDirectory() as save_dir2:
+
+        def get_variable(name):
+            return flow.get_variable(
+                name=name,
+                shape=(10, 80, 40, 20),
+                dtype=dtype,
+                initializer=flow.random_normal_initializer(mean=10, stddev=1),
+                distribute=flow.distribute.split(0),
+            )
+
+        def get_part_of_mixed_model(dtype):
+            @flow.global_function()
+            def model() -> tp.Numpy:
+                with get_placement():
+                    x = get_variable("x")
+                    return x
+
+            return model
+
+        def get_mixed_model(dtype):
+            @flow.global_function()
+            def model() -> tp.Numpy:
+                with get_placement():
+                    x1 = get_variable("x_from_model1")
+                    x2 = get_variable("x_from_model2")
+                    return x1 + x2
+
+            return model
+
+        refresh_session()
+        model1 = get_checkpoint_ready_model(get_part_of_mixed_model, dtype)
+        flow.checkpoint.save(save_dir1)
+        refresh_session()
+        model2 = get_checkpoint_ready_model(get_part_of_mixed_model, dtype)
+        flow.checkpoint.save(save_dir2)
+        refresh_session()
+        mixed_model = get_checkpoint_ready_model(get_mixed_model, dtype)
+        var_dict_from_model1 = flow.checkpoint.get(save_dir1)
+        var_dict_from_model2 = flow.checkpoint.get(save_dir2)
+        new_var_dict = {}
+        for (key, val) in var_dict_from_model1.items():
+            new_var_dict["{}_from_model1".format(key)] = val
+        for (key, val) in var_dict_from_model2.items():
+            new_var_dict["{}_from_model2".format(key)] = val
+        flow.load_variables(new_var_dict)
+        res = mixed_model()
+        test_case.assertTrue(
+            np.allclose(
+                res,
+                var_dict_from_model1["x"].numpy() + var_dict_from_model2["x"].numpy(),
+            )
+        )
+
+
+def _TestResumeTraining(test_case):
+    with tempfile.TemporaryDirectory() as save_dir:
+        refresh_session()
+        model = get_checkpoint_ready_model(
+            get_simple_momentum_training_model, flow.float32
+        )
+        model()
+        flow.checkpoint.save(save_dir)
+        model()
+        w1 = flow.get_all_variables()["w"].numpy()
+        refresh_session()
+        model = get_checkpoint_ready_model(
+            get_simple_momentum_training_model, flow.float32
+        )
+        flow.load_variables(flow.checkpoint.get(save_dir))
+        model()
+        w2 = flow.get_all_variables()["w"].numpy()
+        test_case.assertTrue(np.array_equal(w1, w2))
+
+
+def _TestAssignmentBetweenMemory(test_case, dtype):
+    refresh_session()
+    model = get_checkpoint_ready_model(get_simple_model, dtype)
+    all_vars = flow.get_all_variables()
+    flow.load_variables({"x": all_vars["z"]})
+    flow_res = model()
+    np_res = all_vars["z"].numpy() * 2 + all_vars["y"].numpy()
+    test_case.assertTrue(np.allclose(flow_res, np_res))
+
+
+class TestCheckpoint(flow.unittest.TestCase):
+    @flow.unittest.skip_unless_1n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "legacy model io doesn't work in eager mode",
+    )
+    def test_save_correctness_1node_legacy_api(test_case):
+        _TestSaveCorrectness(test_case, get_simple_model, flow.float, True)
+
+    @flow.unittest.skip_unless_1n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "legacy model io doesn't work in eager mode",
+    )
+    def test_load_correctness_1node_legacy_api(test_case):
+        _TestLoadCorrectness(test_case, get_simple_model, flow.float, True)
+
+    @flow.unittest.skip_unless_1n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "legacy model io doesn't work in eager mode",
+    )
+    def test_save_correctness_1node(test_case):
+        for dtype in [flow.float, flow.double]:
+            _TestSaveCorrectness(test_case, get_large_model, dtype, False)
+
+    @flow.unittest.skip_unless_2n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "legacy model io doesn't work in eager mode",
+    )
+    def test_save_correctness_2node(test_case):
+        _TestSaveCorrectness(test_case, get_large_model, flow.float, False)
+
+    @flow.unittest.skip_unless_1n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "legacy model io doesn't work in eager mode",
+    )
+    def test_load_correctness_1node(test_case):
+        for dtype in [flow.float, flow.double]:
+            _TestLoadCorrectness(test_case, get_large_model, dtype, False)
+
+    @flow.unittest.skip_unless_2n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "legacy model io doesn't work in eager mode",
+    )
+    def test_load_correctness_2node(test_case):
+        _TestLoadCorrectness(test_case, get_large_model, flow.float, False)
+
+    @flow.unittest.skip_unless_1n4d()
+    def test_assignment_between_memory(test_case):
+        _TestAssignmentBetweenMemory(test_case, flow.float)
+
+    @flow.unittest.skip_unless_1n4d()
+    @unittest.skipIf(
+        not flow.unittest.env.eager_execution_enabled(),
+        "Save and load are covered by other tests in lazy mode",
+    )
+    def test_round_trip(test_case):
+        _TestRoundTrip(test_case, get_large_model, flow.float)
+
+    @flow.unittest.skip_unless_1n4d()
+    def test_partially_load_numpy(test_case):
+        _TestPartiallyLoadNumpy(test_case, flow.float)
+
+    @flow.unittest.skip_unless_1n2d()
+    def test_mixed_model(test_case):
+        _TestMixedModel(test_case, flow.float)
+
+    @flow.unittest.skip_unless_1n2d()
+    def test_resume_training(test_case):
+        _TestResumeTraining(test_case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_clip_by_value.py b/python/oneflow/compatible/single_client/test/ops/test_clip_by_value.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c3fbd449f724a0f3dc9619c6e39da5f11648aac
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_clip_by_value.py
@@ -0,0 +1,170 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _np_dtype_to_of_dtype(np_dtype):
+    if np_dtype == np.float32:
+        return flow.float
+    else:
+        raise NotImplementedError
+
+
+def _of_clip_by_value(values, min, max, device_type="gpu", dynamic=False, grad_cb=None):
+    data_type = _np_dtype_to_of_dtype(values.dtype)
+    if callable(grad_cb):
+
+        def clip(values_blob):
+            with flow.scope.placement(device_type, "0:0"):
+                x = flow.get_variable(
+                    "values",
+                    shape=values.shape,
+                    dtype=data_type,
+                    initializer=flow.constant_initializer(0),
+                )
+                x = flow.cast_to_current_logical_view(x)
+                x = x + values_blob
+                y = flow.clip_by_value(x, min, max)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(y)
+            flow.watch_diff(x, grad_cb)
+            return y
+
+    else:
+
+        def clip(values_blob):
+            with flow.scope.placement(device_type, "0:0"):
+                return flow.clip_by_value(values_blob, min, max, name="Clip")
+
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(data_type)
+    if grad_cb is not None:
+        func_config_type = "train"
+    else:
+        func_config_type = "predict"
+    if dynamic:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(type=func_config_type, function_config=func_config)
+        def clip_fn(
+            values_def: oft.ListNumpy.Placeholder(values.shape, dtype=data_type)
+        ):
+            return clip(values_def)
+
+        return clip_fn([values]).get().numpy_list()[0]
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(type=func_config_type, function_config=func_config)
+        def clip_fn(values_def: oft.Numpy.Placeholder(values.shape, dtype=data_type)):
+            return clip(values_def)
+
+        return clip_fn(values).get().numpy()
+
+
+def _compare_with_tf(test_case, values, min, max, device_type, dynamic):
+    with tf.GradientTape() as t:
+        x = tf.Variable(values)
+        y = tf.clip_by_value(x, min, max)
+    dy = t.gradient(y, x)
+
+    def compare_dy(dy_blob):
+        test_case.assertTrue(
+            np.array_equal(
+                dy.numpy(), dy_blob.numpy_list()[0] if dynamic else dy_blob.numpy()
+            )
+        )
+
+    of_y = _of_clip_by_value(
+        values=values,
+        min=min,
+        max=max,
+        device_type=device_type,
+        dynamic=dynamic,
+        grad_cb=compare_dy,
+    )
+    test_case.assertTrue(np.array_equal(y.numpy(), of_y))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestClipByValue(flow.unittest.TestCase):
+    def test_clip_by_value(test_case):
+        values = np.random.randint(low=-100, high=100, size=(8, 512, 4)).astype(
+            np.float32
+        )
+        np_out = np.clip(values, -50, 50)
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            of_out = _of_clip_by_value(values, -50, 50, *arg)
+            test_case.assertTrue(np.array_equal(np_out, of_out))
+
+    def test_clip_by_min(test_case):
+        values = np.random.standard_normal((100, 30)).astype(np.float32)
+        np_out = np.clip(values, a_min=0, a_max=None)
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            of_out = _of_clip_by_value(values, 0, None, *arg)
+            test_case.assertTrue(np.array_equal(np_out, of_out))
+
+    def test_clip_by_max(test_case):
+        values = np.random.standard_normal((2, 64, 800, 1088)).astype(np.float32)
+        np_out = np.clip(values, a_min=None, a_max=0.2)
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            of_out = _of_clip_by_value(values, None, 0.2, *arg)
+            test_case.assertTrue(np.allclose(np_out, of_out))
+
+    def test_clip_by_value_grad(test_case):
+        values = np.random.standard_normal(1024).astype(np.float32)
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_with_tf(test_case, values, 0, 0.5, *arg)
+
+    def test_clip_by_value_grad_case_1(test_case):
+        values = np.random.standard_normal((128, 10, 27)).astype(np.float32)
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_with_tf(test_case, values, -0.2, 0.2, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_coco_reader.py b/python/oneflow/compatible/single_client/test/ops/test_coco_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..5678fddc258fbaf32c8800f1172781b52fb65acc
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_coco_reader.py
@@ -0,0 +1,448 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+VERBOSE = False
+coco_dict = dict()
+
+
+def _coco(anno_file):
+    global coco_dict
+    if anno_file not in coco_dict:
+        from pycocotools.coco import COCO
+
+        coco_dict[anno_file] = COCO(anno_file)
+    return coco_dict[anno_file]
+
+
+def _make_coco_data_load_fn(
+    anno_file,
+    image_dir,
+    nthread,
+    batch_size,
+    stride_partition,
+    shuffle_after_epoch,
+    ret_image_id_only=False,
+):
+    flow.clear_default_session()
+    flow.config.cpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def coco_load_fn():
+        with flow.scope.placement("cpu", "0:0-{}".format(nthread - 1)):
+            (
+                image,
+                image_id,
+                image_size,
+                gt_bbox,
+                gt_label,
+                gt_segm,
+                gt_segm_index,
+            ) = flow.data.coco_reader(
+                annotation_file=anno_file,
+                image_dir=image_dir,
+                batch_size=batch_size,
+                shuffle=shuffle_after_epoch,
+                stride_partition=stride_partition,
+                name="COCOReader",
+            )
+            if ret_image_id_only:
+                return image_id
+            decoded_image = flow.image_decode(image, dtype=flow.float)
+            image_list = flow.tensor_buffer_to_tensor_list(
+                decoded_image, shape=(800, 1333, 3), dtype=flow.float
+            )
+            bbox_list = flow.tensor_buffer_to_tensor_list(
+                gt_bbox, shape=(128, 4), dtype=flow.float
+            )
+            label_list = flow.tensor_buffer_to_tensor_list(
+                gt_label, shape=(128,), dtype=flow.int32
+            )
+            segm_list = flow.tensor_buffer_to_tensor_list(
+                gt_segm, shape=(1024, 2), dtype=flow.float
+            )
+            segm_index_list = flow.tensor_buffer_to_tensor_list(
+                gt_segm_index, shape=(1024, 3), dtype=flow.int32
+            )
+        return (
+            image_id,
+            image_size,
+            image_list,
+            bbox_list,
+            label_list,
+            segm_list,
+            segm_index_list,
+        )
+
+    return coco_load_fn
+
+
+def _get_coco_image_samples(anno_file, image_dir, image_ids):
+    coco = _coco(anno_file)
+    category_id_to_contiguous_id_map = _get_category_id_to_contiguous_id_map(coco)
+    (image, image_size) = _read_images_with_cv(coco, image_dir, image_ids)
+    bbox = _read_bbox(coco, image_ids)
+    label = _read_label(coco, image_ids, category_id_to_contiguous_id_map)
+    img_segm_poly_list = _read_segm_poly(coco, image_ids)
+    (poly, poly_index) = _segm_poly_list_to_tensor(img_segm_poly_list)
+    samples = []
+    for (im, ims, b, l, p, pi) in zip(image, image_size, bbox, label, poly, poly_index):
+        samples.append(
+            dict(image=im, image_size=ims, bbox=b, label=l, poly=p, poly_index=pi)
+        )
+    return samples
+
+
+def _get_category_id_to_contiguous_id_map(coco):
+    return {v: i + 1 for (i, v) in enumerate(coco.getCatIds())}
+
+
+def _read_images_with_cv(coco, image_dir, image_ids):
+    image_files = [
+        os.path.join(image_dir, coco.imgs[img_id]["file_name"]) for img_id in image_ids
+    ]
+    image_size = [
+        (coco.imgs[img_id]["height"], coco.imgs[img_id]["width"])
+        for img_id in image_ids
+    ]
+    return (
+        [cv2.imread(image_file).astype(np.single) for image_file in image_files],
+        image_size,
+    )
+
+
+def _bbox_convert_from_xywh_to_xyxy(bbox, image_h, image_w):
+    (x, y, w, h) = bbox
+    (x1, y1) = (x, y)
+    x2 = x1 + max(w - 1, 0)
+    y2 = y1 + max(h - 1, 0)
+    x1 = min(max(x1, 0), image_w - 1)
+    y1 = min(max(y1, 0), image_h - 1)
+    x2 = min(max(x2, 0), image_w - 1)
+    y2 = min(max(y2, 0), image_h - 1)
+    if x1 >= x2 or y1 >= y2:
+        return None
+    return [x1, y1, x2, y2]
+
+
+def _read_bbox(coco, image_ids):
+    img_bbox_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        assert len(anno_ids) > 0, "image with id {} has no anno".format(img_id)
+        image_h = coco.imgs[img_id]["height"]
+        image_w = coco.imgs[img_id]["width"]
+        bbox_list = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            bbox = anno["bbox"]
+            assert isinstance(bbox, list)
+            bbox_ = _bbox_convert_from_xywh_to_xyxy(bbox, image_h, image_w)
+            if bbox_ is not None:
+                bbox_list.append(bbox_)
+        bbox_array = np.array(bbox_list, dtype=np.single)
+        img_bbox_list.append(bbox_array)
+    return img_bbox_list
+
+
+def _read_label(coco, image_ids, category_id_to_contiguous_id_map):
+    img_label_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        assert len(anno_ids) > 0, "image with id {} has no anno".format(img_id)
+        label_list = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            cate_id = anno["category_id"]
+            isinstance(cate_id, int)
+            label_list.append(category_id_to_contiguous_id_map[cate_id])
+        label_array = np.array(label_list, dtype=np.int32)
+        img_label_list.append(label_array)
+    return img_label_list
+
+
+def _read_segm_poly(coco, image_ids):
+    img_segm_poly_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        assert len(anno_ids) > 0, "img {} has no anno".format(img_id)
+        segm_poly_list = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            segm = anno["segmentation"]
+            assert isinstance(segm, list)
+            assert len(segm) > 0, str(len(segm))
+            assert all([len(poly) > 0 for poly in segm]), str(
+                [len(poly) for poly in segm]
+            )
+            segm_poly_list.append(segm)
+        img_segm_poly_list.append(segm_poly_list)
+    return img_segm_poly_list
+
+
+def _segm_poly_list_to_tensor(img_segm_poly_list):
+    poly_array_list = []
+    poly_index_array_list = []
+    for (img_idx, segm_poly_list) in enumerate(img_segm_poly_list):
+        img_poly_elem_list = []
+        img_poly_index_list = []
+        for (obj_idx, poly_list) in enumerate(segm_poly_list):
+            for (poly_idx, poly) in enumerate(poly_list):
+                img_poly_elem_list.extend(poly)
+                for (pt_idx, pt) in enumerate(poly):
+                    if pt_idx % 2 == 0:
+                        img_poly_index_list.append([pt_idx / 2, poly_idx, obj_idx])
+        img_poly_array = np.array(img_poly_elem_list, dtype=np.single).reshape(-1, 2)
+        assert img_poly_array.size > 0, segm_poly_list
+        poly_array_list.append(img_poly_array)
+        img_poly_index_array = np.array(img_poly_index_list, dtype=np.int32)
+        assert img_poly_index_array.size > 0, segm_poly_list
+        poly_index_array_list.append(img_poly_index_array)
+    return (poly_array_list, poly_index_array_list)
+
+
+def _get_coco_sorted_imgs(anno_file):
+    coco = _coco(anno_file)
+    img_ids = coco.getImgIds()
+    img_ids.sort()
+    img_info_list = []
+    for (i, img_id) in enumerate(img_ids):
+        img_h = coco.imgs[img_id]["height"]
+        img_w = coco.imgs[img_id]["width"]
+        group_id = int(img_h / img_w)
+        anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=None)
+        anno = coco.loadAnns(anno_ids)
+        if not _has_valid_annotation(anno):
+            continue
+        img_info_list.append(
+            dict(index=i, image_id=img_id, group_id=group_id, anno_len=len(anno_ids))
+        )
+    return img_info_list
+
+
+def _count_visible_keypoints(anno):
+    return sum((sum((1 for v in ann["keypoints"][2::3] if v > 0)) for ann in anno))
+
+
+def _has_only_empty_bbox(anno):
+    return all((any((o <= 1 for o in obj["bbox"][2:])) for obj in anno))
+
+
+def _has_valid_annotation(anno):
+    if len(anno) == 0:
+        return False
+    if _has_only_empty_bbox(anno):
+        return False
+    if "keypoints" not in anno[0]:
+        return True
+    if _count_visible_keypoints(anno) >= 10:
+        return True
+    return False
+
+
+class GroupedDistributedSampler(object):
+    def __init__(self, shards, batch_size, images, stride_sample, max_iter=3):
+        assert batch_size % shards == 0
+        self._images = images
+        self._shards = shards
+        self._shard_size = math.ceil(len(images) / shards)
+        self._batch_size = batch_size
+        self._batch_size_per_shard = batch_size // shards
+        self._stride_sample = stride_sample
+        self._max_iter = max_iter
+        self._init_sample_idx()
+        self._init_group_buckets()
+
+    def _init_sample_idx(self):
+        if self._stride_sample:
+            self._sample_idx = list(range(self._shards))
+        else:
+            self._sample_idx = [rank * self._shard_size for rank in range(self._shards)]
+            self._sample_idx_in_shard = [0 for _ in range(self._shards)]
+
+    def _init_group_buckets(self):
+        self._group_buckets = [[[] for _ in range(2)] for _ in range(self._shards)]
+
+    def __iter__(self):
+        for i in range(self._max_iter):
+            sample_ids = []
+            for rank in range(self._shards):
+                sample_cnt_cur_rank = 0
+                sample_ids_cur_rank = []
+                group_buckets_cur_rank = self._group_buckets[rank]
+                if (
+                    len(group_buckets_cur_rank[0]) > 0
+                    and len(group_buckets_cur_rank[1]) > 0
+                ):
+                    if (
+                        group_buckets_cur_rank[0][0]["index"]
+                        < group_buckets_cur_rank[1][0]["index"]
+                    ):
+                        sample = group_buckets_cur_rank[0].pop(0)
+                    else:
+                        sample = group_buckets_cur_rank[1].pop(0)
+                elif len(group_buckets_cur_rank[0]) > 0:
+                    sample = group_buckets_cur_rank[0].pop(0)
+                elif len(group_buckets_cur_rank[1]) > 0:
+                    sample = group_buckets_cur_rank[1].pop(0)
+                else:
+                    sample = self._next_sample(rank)
+                group_id = sample["group_id"]
+                sample_ids_cur_rank.append(sample["image_id"])
+                sample_cnt_cur_rank += 1
+                while sample_cnt_cur_rank < self._batch_size_per_shard:
+                    if len(group_buckets_cur_rank[group_id]) > 0:
+                        sample = group_buckets_cur_rank[group_id].pop(0)
+                        sample_ids_cur_rank.append(sample["image_id"])
+                        sample_cnt_cur_rank += 1
+                        continue
+                    sample = self._next_sample(rank)
+                    if sample["group_id"] == group_id:
+                        sample_ids_cur_rank.append(sample["image_id"])
+                        sample_cnt_cur_rank += 1
+                    else:
+                        group_buckets_cur_rank[sample["group_id"]].append(sample)
+                sample_ids.extend(sample_ids_cur_rank)
+            yield sample_ids
+
+    def _next_sample(self, rank):
+        sample = self._images[self._sample_idx[rank]]
+        if self._stride_sample:
+            self._sample_idx[rank] += self._shards
+        else:
+            self._sample_idx_in_shard[rank] += 1
+            self._sample_idx[rank] += 1
+            if self._sample_idx_in_shard[rank] == self._shard_size:
+                self._sample_idx[rank] += (self._shards - 1) * self._shard_size
+                self._sample_idx_in_shard[rank] = 0
+        if self._sample_idx[rank] >= len(self._images):
+            self._sample_idx[rank] %= len(self._images)
+        return sample
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestCocoReader(flow.unittest.TestCase):
+    def test_coco_reader(test_case, verbose=VERBOSE):
+        anno_file = "/dataset/mscoco_2017/annotations/instances_val2017.json"
+        image_dir = "/dataset/mscoco_2017/val2017"
+        of_coco_load_fn = _make_coco_data_load_fn(
+            anno_file, image_dir, 1, 2, True, True
+        )
+        (
+            image_id,
+            image_size,
+            image,
+            bbox,
+            label,
+            poly,
+            poly_index,
+        ) = of_coco_load_fn().get()
+        image_id = image_id.numpy()
+        image_size = image_size.numpy()
+        image = image.numpy_lists()
+        bbox = bbox.numpy_lists()
+        label = label.numpy_lists()
+        poly = poly.numpy_lists()
+        poly_index = poly_index.numpy_lists()
+        samples = _get_coco_image_samples(anno_file, image_dir, image_id)
+        for (i, sample) in enumerate(samples):
+            if verbose:
+                print(
+                    "#{} of label:\n".format(i),
+                    label[0][i].squeeze(),
+                    type(label[0][i].squeeze()),
+                    label[0][i].squeeze().shape,
+                )
+                print(
+                    "#{} coco label:\n".format(i),
+                    sample["label"],
+                    type(sample["label"]),
+                    sample["label"].shape,
+                )
+            test_case.assertTrue(np.array_equal(image[0][i].squeeze(), sample["image"]))
+            test_case.assertTrue(np.array_equal(image_size[i], sample["image_size"]))
+            test_case.assertTrue(np.allclose(bbox[0][i].squeeze(), sample["bbox"]))
+            cur_label = label[0][i].squeeze()
+            if len(cur_label.shape) == 0:
+                cur_label = np.array([cur_label])
+            test_case.assertTrue(np.array_equal(cur_label, sample["label"]))
+            test_case.assertTrue(np.allclose(poly[0][i].squeeze(), sample["poly"]))
+            test_case.assertTrue(
+                np.array_equal(poly_index[0][i].squeeze(), sample["poly_index"])
+            )
+
+    def test_coco_reader_distributed_stride(test_case, verbose=VERBOSE):
+        anno_file = "/dataset/mscoco_2017/annotations/instances_val2017.json"
+        image_dir = "/dataset/mscoco_2017/val2017"
+        image_info_list = _get_coco_sorted_imgs(anno_file)
+        if verbose:
+            print("Info of the first 20 images:")
+            for (i, image_info) in enumerate(image_info_list[:20]):
+                print(
+                    "index: {}, image_id: {}, group_id: {}, anno len: {}".format(
+                        i,
+                        image_info["image_id"],
+                        image_info["group_id"],
+                        image_info["anno_len"],
+                    )
+                )
+        sampler = GroupedDistributedSampler(4, 8, image_info_list, True)
+        of_coco_load_fn = _make_coco_data_load_fn(
+            anno_file, image_dir, 4, 8, True, False, True
+        )
+        for (i, sample_ids) in enumerate(sampler):
+            image_id = of_coco_load_fn().get().numpy()
+            if verbose:
+                print("#{} image_id:".format(i), image_id)
+                print("#{} sample_ids:".format(i), sample_ids)
+            test_case.assertTrue(np.array_equal(image_id, sample_ids))
+
+    def test_coco_reader_distributed_contiguous(test_case, verbose=VERBOSE):
+        anno_file = "/dataset/mscoco_2017/annotations/instances_val2017.json"
+        image_dir = "/dataset/mscoco_2017/val2017"
+        image_info_list = _get_coco_sorted_imgs(anno_file)
+        sampler = GroupedDistributedSampler(4, 8, image_info_list, False)
+        of_coco_load_fn = _make_coco_data_load_fn(
+            anno_file, image_dir, 4, 8, False, False, True
+        )
+        for (i, sample_ids) in enumerate(sampler):
+            image_id = of_coco_load_fn().get().numpy()
+            if verbose:
+                print("#{} image_id:".format(i), image_id)
+                print("#{} sample_ids:".format(i), sample_ids)
+            test_case.assertTrue(np.array_equal(image_id, sample_ids))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_combined_margin_loss.py b/python/oneflow/compatible/single_client/test/ops/test_combined_margin_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5458044c8ee30b592ab473beceaa03b4ff928a0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_combined_margin_loss.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import Args, GenArgDict, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def margin_loss(loss_m1, loss_m2, loss_m3, s, inputs, labels):
+    inputs = inputs * s
+    class_num = inputs.shape[1]
+    if loss_m1 != 1.0 or loss_m2 != 0.0 or loss_m3 != 0.0:
+        if loss_m1 == 1.0 and loss_m2 == 0.0:
+            s_m = s * loss_m3
+            gt_one_hot = flow.one_hot(
+                labels, depth=class_num, on_value=s_m, off_value=0.0, dtype=flow.float
+            )
+            inputs = inputs - gt_one_hot
+        else:
+            labels_expand = flow.reshape(labels, (labels.shape[0], 1))
+            zy = flow.gather(inputs, labels_expand, batch_dims=1)
+            cos_t = zy * (1 / s)
+            t = flow.math.acos(cos_t)
+            if loss_m1 != 1.0:
+                t = t * loss_m1
+            if loss_m2 > 0.0:
+                t = t + loss_m2
+            body = flow.math.cos(t)
+            if loss_m3 > 0.0:
+                body = body - loss_m3
+            new_zy = body * s
+            diff = new_zy - zy
+            gt_one_hot = flow.one_hot(
+                labels, depth=class_num, on_value=1.0, off_value=0.0, dtype=flow.float
+            )
+            body = gt_one_hot * diff
+            inputs = inputs + body
+    return inputs
+
+
+def test_combined_margin_loss(
+    test_case, device_type, input_shape, label_shape, data_type, m1, m2, m3, s
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_job(
+        x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
+        labels: oft.Numpy.Placeholder(label_shape, dtype=flow.int32),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                name="v",
+                shape=(1,),
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+            )
+            x = x + v
+            x1 = flow.identity(x)
+            x2 = flow.identity(x)
+            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
+            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
+            x1 = flow.cast(x1, data_type)
+            x2 = flow.cast(x2, data_type)
+        with flow.scope.placement(device_type, "0:0-3"):
+            y1 = (
+                flow.combined_margin_loss(
+                    x1.with_distribute(flow.distribute.split(1)),
+                    labels.with_distribute(flow.distribute.broadcast()),
+                    m1,
+                    m2,
+                    m3,
+                )
+                * s
+            )
+            y2 = margin_loss(m1, m2, m3, s, x2, labels)
+        with flow.scope.placement(device_type, "0:0"):
+            y1 = flow.cast(y1, flow.float)
+            y2 = flow.cast(y2, flow.float)
+            flow.watch(y1, test_global_storage.Setter("y1"))
+            flow.watch(y2, test_global_storage.Setter("y2"))
+            loss = y1 + y2
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(flow.math.reduce_sum(loss))
+        return loss
+
+    x = np.random.uniform(low=-1, high=1, size=input_shape).astype(np.float32)
+    labels = np.random.randint(0, 1000, size=(*label_shape,)).astype(np.int32)
+    test_job(x, labels).get()
+    tol = 0.002
+    y1 = test_global_storage.Get("y1")
+    y2 = test_global_storage.Get("y2")
+    test_case.assertTrue(np.allclose(y1, y2, rtol=tol, atol=tol))
+    x1_diff = test_global_storage.Get("x1_diff")
+    x2_diff = test_global_storage.Get("x2_diff")
+    test_case.assertTrue(np.allclose(x1_diff, x2_diff, rtol=tol, atol=tol))
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestCombinedMarginLoss(flow.unittest.TestCase):
+    def test_case(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 1000)]
+        arg_dict["label_shape"] = [(64,)]
+        arg_dict["data_type"] = [flow.float32]
+        arg_dict["m1"] = [0.3]
+        arg_dict["m2"] = [0.5]
+        arg_dict["m3"] = [0.4]
+        arg_dict["s"] = [5]
+        for arg in GenArgDict(arg_dict):
+            test_combined_margin_loss(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_compat_conv2d.py b/python/oneflow/compatible/single_client/test/ops/test_compat_conv2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07c8a71dc068adfc2d3732a04dba7b49dbcfe04
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_compat_conv2d.py
@@ -0,0 +1,232 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def grouped_convolution2D(
+    inputs, filters, padding, num_groups, strides=None, dilation_rate=None
+):
+    input_list = tf.split(inputs, num_groups, axis=-1)
+    filter_list = tf.split(filters, num_groups, axis=-1)
+    output_list = []
+    for (conv_idx, (input_tensor, filter_tensor)) in enumerate(
+        zip(input_list, filter_list)
+    ):
+        output_list.append(
+            tf.nn.conv2d(
+                input_tensor,
+                filter_tensor,
+                padding="VALID",
+                strides=[1, 1, 1, 1],
+                data_format="NHWC",
+            )
+        )
+    outputs = tf.concat(output_list, axis=-1)
+    return outputs
+
+
+def compare_with_tensorflow(
+    device_type, x_shape, filters, kernel_size, groups, padding="VALID", stride=1
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            weight_shape = (filters, int(x.shape[1] / groups), kernel_size, kernel_size)
+            weight = flow.get_variable(
+                "conv-weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            loss = flow.nn.compat_conv2d(
+                x,
+                weight,
+                strides=[stride, stride],
+                padding=padding,
+                data_format="NCHW",
+                dilations=[1, 1],
+                groups=groups,
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(0, 2, 3, 1))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        if groups == 1:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(2, 3, 1, 0)
+            )
+            tf_out = tf.nn.conv2d(
+                x,
+                weight,
+                strides=[1, stride, stride, 1],
+                padding=padding,
+                data_format="NHWC",
+            )
+        else:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(2, 3, 1, 0)
+            )
+            tf_out = grouped_convolution2D(
+                x, weight, padding=padding, num_groups=groups
+            )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 3, 1)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    assert np.allclose(
+        of_out.numpy().transpose(0, 2, 3, 1), tf_out.numpy(), rtol=0.005, atol=0.005
+    )
+    assert np.allclose(
+        test_global_storage.Get("x_diff").transpose(0, 2, 3, 1),
+        tf_x_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+    assert np.allclose(
+        test_global_storage.Get("weight_diff").transpose(2, 3, 1, 0),
+        tf_weight_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCompatConv2d(flow.unittest.TestCase):
+    def test_conv1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [8]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv4(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv5(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [8]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv6(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv7(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(2, 4, 8, 8)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [4]
+        arg_dict["groups"] = [1]
+        arg_dict["padding"] = ["SAME"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv8(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(2, 4, 8, 8)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [5]
+        arg_dict["groups"] = [1]
+        arg_dict["padding"] = ["SAME"]
+        arg_dict["stride"] = [2]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_concat.py b/python/oneflow/compatible/single_client/test/ops/test_concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..879f5f15e70d59ea17919cd43ef3648e84348674
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_concat.py
@@ -0,0 +1,378 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import random
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape, y_shape, dtype, axis):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConcatJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            y = flow.get_variable(
+                "y",
+                shape=y_shape,
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            x = flow.cast_to_current_logical_view(x)
+            y = flow.cast_to_current_logical_view(y)
+            loss = flow.concat([x, y], axis)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(y, test_global_storage.Setter("y"))
+            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConcatJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        y = tf.Variable(test_global_storage.Get("y"))
+        tf_out = tf.concat([x, y], axis)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_y_diff = tape.gradient(tf_out, y, loss_diff)
+    assert np.array_equal(of_out.numpy(), tf_out.numpy())
+    assert np.array_equal(test_global_storage.Get("x_diff"), tf_x_diff.numpy())
+    assert np.array_equal(test_global_storage.Get("y_diff"), tf_y_diff.numpy())
+
+
+def _of_dynamic_concat(
+    inputs,
+    input_static_shape,
+    axis,
+    device_type,
+    watch_cb=None,
+    make_watch_diff_cb=None,
+):
+    assert isinstance(inputs, (list, tuple))
+    assert len(inputs) >= 2
+    assert callable(make_watch_diff_cb)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def dynamic_concat_job(
+        input_0_def: oft.ListNumpy.Placeholder(
+            shape=input_static_shape, dtype=flow.float
+        ),
+        input_1_def: oft.ListNumpy.Placeholder(
+            shape=input_static_shape, dtype=flow.float
+        ),
+    ):
+        var_0 = flow.get_variable(
+            "Var0",
+            shape=(1,),
+            dtype=flow.float,
+            initializer=flow.constant_initializer(value=1, dtype=flow.float),
+            trainable=True,
+        )
+        var_1 = flow.get_variable(
+            "Var1",
+            shape=(1,),
+            dtype=flow.float,
+            initializer=flow.constant_initializer(value=1, dtype=flow.float),
+            trainable=True,
+        )
+        var_0 = flow.cast_to_current_logical_view(var_0)
+        var_1 = flow.cast_to_current_logical_view(var_1)
+        input_0_def = flow.cast_to_current_logical_view(input_0_def)
+        input_1_def = flow.cast_to_current_logical_view(input_1_def)
+        if callable(watch_cb):
+            flow.watch(var_0, watch_cb)
+            flow.watch(var_1, watch_cb)
+            flow.watch(flow.identity(input_0_def), watch_cb)
+            flow.watch(flow.identity(input_1_def), watch_cb)
+        var_0 = var_0 * input_0_def
+        var_1 = var_1 * input_1_def
+        if callable(watch_cb):
+            flow.watch(var_0, watch_cb)
+            flow.watch(var_1, watch_cb)
+        result = flow.concat(
+            [var_0, var_1], axis=axis, max_dim_size=input_static_shape[axis]
+        )
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+        ).minimize(result)
+        flow.watch_diff(var_0, make_watch_diff_cb(0))
+        flow.watch_diff(var_1, make_watch_diff_cb(1))
+        return result
+
+    ret = dynamic_concat_job([inputs[0]], [inputs[1]]).get()
+    return ret.numpy(0)
+
+
+def _rand_part_range(start, stop, part_num):
+    part_size = math.ceil((stop - start) / part_num)
+    begin = start
+    for i in range(part_num):
+        end = part_size * (i + 2)
+        end = random.randrange(begin + 1, min(end, stop - (part_num - i - 1)))
+        yield (begin, end)
+        begin = end
+
+
+def _slice(input, axis, start, stop):
+    slice_list = []
+    for i in range(input.ndim):
+        if i == axis:
+            slice_list.append(slice(start, stop))
+        else:
+            slice_list.append(slice(None))
+    return input[tuple(slice_list)]
+
+
+def _rand_inputs(shape, split_axis, part_num):
+    entire_input = np.random.rand(*shape).astype(np.single)
+    inputs = []
+    last_stop = 0
+    for (start, stop) in _rand_part_range(0, shape[split_axis], part_num):
+        last_stop = stop
+        input_slice = _slice(entire_input, split_axis, start, stop)
+        inputs.append(input_slice)
+    return (_slice(entire_input, split_axis, 0, last_stop), inputs)
+
+
+def _test_dynamic_concat(test_case, shape, axis, device_type, verbose=False):
+    assert axis >= 0 and axis < len(shape)
+    (output, inputs) = _rand_inputs(shape, axis, 2)
+
+    def print_blob(blob):
+        print(blob.numpy(0), blob.numpy(0).shape)
+
+    def make_watch_diff_cb(input_idx):
+        def watch_diff_cb(blob):
+            test_case.assertTrue(
+                np.array_equal(
+                    blob.numpy(0),
+                    np.ones(shape=inputs[input_idx].shape, dtype=np.single),
+                )
+            )
+
+        return watch_diff_cb
+
+    of_output = _of_dynamic_concat(
+        inputs,
+        tuple(shape),
+        axis,
+        device_type,
+        print_blob if verbose else None,
+        make_watch_diff_cb,
+    )
+    if verbose:
+        print("inputs shapes:", [input.shape for input in inputs])
+        print("output shape:", output.shape)
+        print("of_output shape:", of_output.shape)
+        print("output:\n", output)
+        print("of_output:\n", of_output)
+    test_case.assertTrue(np.array_equal(of_output, output))
+
+
+def _test_static_concat(test_case, shape, axis):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+
+    def compare_var_diff(var_blob):
+        test_case.assertTrue(
+            np.array_equal(var_blob.numpy(), np.ones(shape=shape, dtype=np.single))
+        )
+
+    @flow.global_function(type="train", function_config=func_config)
+    def static_concat_job(
+        input_0_def: oft.Numpy.Placeholder(shape=shape, dtype=flow.float),
+        input_1_def: oft.Numpy.Placeholder(shape=shape, dtype=flow.float),
+    ):
+        var = flow.get_variable(
+            "var",
+            shape=shape,
+            dtype=flow.float,
+            initializer=flow.random_uniform_initializer(),
+            trainable=True,
+        )
+        concated = flow.concat([input_0_def, input_1_def, var], axis=axis)
+        test_case.assertTrue(not concated.is_dynamic)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(concated)
+        flow.watch_diff(var, compare_var_diff)
+        return (var, concated)
+
+    inputs = []
+    for i in range(2):
+        inputs.append(np.random.rand(*shape).astype(np.single))
+    (var, concated) = static_concat_job(inputs[0], inputs[1]).get()
+    test_case.assertTrue(
+        np.array_equal(
+            np.concatenate([inputs[0], inputs[1], var.numpy()], axis=axis),
+            concated.numpy(),
+        )
+    )
+
+
+def _test_hybrid_concat(
+    test_case, static_shape, axis, max_dim_size=None, verbose=False
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    def compare_var_diff(var_blob):
+        test_case.assertTrue(
+            np.array_equal(
+                var_blob.numpy(), np.ones(shape=static_shape, dtype=np.single)
+            )
+        )
+
+    rand_sub_shape = list(static_shape).copy()
+    rand_sub_shape[axis] = random.randrange(1, static_shape[axis])
+    rand_sub_shape = tuple(rand_sub_shape)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def hybrid_concat_job(
+        input_0_def: oft.ListNumpy.Placeholder(shape=static_shape, dtype=flow.float),
+        input_1_def: oft.ListNumpy.Placeholder(shape=static_shape, dtype=flow.float),
+    ):
+        var = flow.get_variable(
+            "var",
+            shape=static_shape,
+            dtype=flow.float,
+            initializer=flow.random_uniform_initializer(),
+            trainable=True,
+        )
+        constant = flow.constant(1.0, dtype=flow.float, shape=rand_sub_shape)
+        inputs = [
+            flow.cast_to_current_logical_view(input)
+            for input in [var, input_0_def, input_1_def, constant]
+        ]
+        concated = flow.concat(inputs, axis=axis, max_dim_size=max_dim_size)
+        if verbose:
+            print("concated static shape:", concated.shape)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(concated)
+        flow.watch_diff(var, compare_var_diff)
+        if max_dim_size is None:
+            test_case.assertTrue(
+                concated.shape[axis] == static_shape[axis] * 3 + rand_sub_shape[axis]
+            )
+        else:
+            test_case.assertTrue(concated.shape[axis] == max_dim_size)
+        return (var, concated)
+
+    (output, inputs) = _rand_inputs(static_shape, axis, 2)
+    if verbose:
+        print("static_shape:", static_shape)
+        print("input_0 shape:", inputs[0].shape)
+        print("input_1 shape:", inputs[1].shape)
+        print("output shape:", output.shape)
+        print("rand_sub_shape:", rand_sub_shape)
+    (var, concated) = hybrid_concat_job([inputs[0]], [inputs[1]]).get()
+    if verbose:
+        print("var shape:", var.numpy().shape)
+        print("concated shape:", concated.numpy(0).shape)
+    test_case.assertTrue(
+        np.array_equal(
+            np.concatenate(
+                [var.numpy(), output, np.ones(shape=rand_sub_shape, dtype=np.single)],
+                axis=axis,
+            ),
+            concated.numpy(0),
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConcat(flow.unittest.TestCase):
+    def test_concat(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 20, 30)]
+        arg_dict["y_shape"] = [(10, 20, 30)]
+        arg_dict["dtype"] = ["float32", "double"]
+        arg_dict["axis"] = [0, 1, 2]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_dynamic_concat_case_0(test_case):
+        _test_dynamic_concat(test_case, (64, 4), 0, "gpu")
+
+    def test_dynamic_concat_case_1(test_case):
+        _test_dynamic_concat(test_case, (2, 10), 1, "cpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_dynamic_concat_case_2(test_case):
+        _test_dynamic_concat(test_case, (4, 7, 128), 2, "gpu")
+
+    def test_dynamic_concat_case_3(test_case):
+        _test_dynamic_concat(test_case, (16,), 0, "cpu")
+
+    def test_static_concat_case_0(test_case):
+        _test_static_concat(test_case, (10, 7), 0)
+
+    def test_static_concat_case_1(test_case):
+        _test_static_concat(test_case, (3, 8, 4), 1)
+
+    def test_hybrid_concat_case_0(test_case):
+        _test_hybrid_concat(test_case, (64, 4), 0)
+
+    def test_hybrid_concat_case_1(test_case):
+        _test_hybrid_concat(test_case, (10,), 0, 30)
+
+    def test_hybrid_concat_case_2(test_case):
+        _test_hybrid_concat(test_case, (10, 7, 5), 1, 21)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_constant.py b/python/oneflow/compatible/single_client/test/ops/test_constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..30c22521c8057a1437321ea5d26ec67dfa6eafb2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_constant.py
@@ -0,0 +1,79 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def _test(test_case, device_type, type_name_value):
+    (type_name, value) = type_name_value
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    flow_type = type_name_to_flow_type[type_name]
+    np_type = type_name_to_np_type[type_name]
+    shape = (1024, 1024)
+
+    @flow.global_function(function_config=func_config)
+    def constant_job():
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.constant(value, dtype=flow_type, shape=shape)
+
+    of_out = constant_job().get().numpy()
+    test_case.assertTrue(np.array_equal(of_out, np.full(shape, value).astype(np_type)))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConstant(flow.unittest.TestCase):
+    def test_constant(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["type_name_value"] = [
+            ("float32", 0),
+            ("float32", 0.0),
+            ("float32", 1),
+            ("float32", 1.0),
+            ("float32", -1),
+            ("float32", -1.0),
+            ("float32", math.pi),
+            ("float32", -math.pi),
+            ("float32", float("inf")),
+            ("float32", float("-inf")),
+            ("int32", 0),
+            ("int32", 0.0),
+            ("int32", 1),
+            ("int32", 1.0),
+            ("int32", -1),
+            ("int32", -1.0),
+            ("int32", 2 ** 31 - 1),
+            ("int32", -(2 ** 31)),
+        ]
+        for arg in GenArgList(arg_dict):
+            _test(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_constant_like.py b/python/oneflow/compatible/single_client/test/ops/test_constant_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f7c3b3c0a1f3b3231915acd31298b467272d1c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_constant_like.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _check(test_case, x, y, value, dtype=None):
+    np_constant_like = np.full(x.shape, value)
+    test_case.assertTrue(np.array_equal(np_constant_like, y))
+
+
+def _run_test(test_case, x, value, dtype=None, device="gpu"):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def ConstantLikeJob(x: oft.Numpy.Placeholder(x.shape)):
+        return flow.constant_like(x, value=value, dtype=dtype)
+
+    y = ConstantLikeJob(x).get()
+    _check(test_case, x, y.numpy(), value, dtype=dtype)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConstantLike(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_constant_like_gpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 1.0, flow.float, "gpu")
+
+    def test_constant_like_cpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 2.0, flow.float, "cpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_constant_like_gpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 3.0, flow.double, "gpu")
+
+    def test_constant_like_cpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 4.0, flow.double, "cpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_constant_like_gpu_int8(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 5.0, flow.int8, "gpu")
+
+    def test_constant_like_cpu_int8(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 6.0, flow.int8, "cpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_constant_like_gpu_int32(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 7.0, flow.int32, "gpu")
+
+    def test_constant_like_cpu_int32(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 8.0, flow.int32, "cpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_constant_like_gpu_int64(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 9.0, flow.int64, "gpu")
+
+    def test_constant_like_cpu_int64(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 10.0, flow.int64, "cpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_constant_like_gpu(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 11.0, device="gpu")
+
+    def test_constant_like_cpu(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, 12.0, device="cpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_constant_pad2d.py b/python/oneflow/compatible/single_client/test/ops/test_constant_pad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a785579446af41eef4ba99ba305c28581375cc4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_constant_pad2d.py
@@ -0,0 +1,282 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    Args,
+    Array2Numpy,
+    Coordinate2Index,
+    FlattenArray,
+    GenArgDict,
+    GenArgList,
+    Index2Coordinate,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _make_op_function(
+    test_case,
+    input,
+    padding,
+    constant_value,
+    grad,
+    device_type,
+    value_type,
+    machine_ids,
+    device_counts,
+):
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    if value_type == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type)
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def _compare_diff(blob: tp.Numpy):
+        test_case.assertTrue(np.allclose(grad, blob, 0.001, 0.001))
+
+    if value_type == flow.float32 or value_type == flow.float64:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=value_type)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=value_type,
+                    initializer=flow.zeros_initializer(),
+                )
+                out = flow.constant_pad2d(x, padding, constant_value)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(out)
+            flow.watch_diff(x, _compare_diff)
+            return out
+
+        return op_function
+    elif value_type == flow.int32:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                )
+                y_int32 = flow.constant_pad2d(x, padding, constant_value)
+                y_fp32 = flow.cast(y_int32, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x, _compare_diff)
+            return y_fp32
+
+        return op_function
+    elif value_type == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                input_x = x_var + x
+                x_fp32 = flow.cast(input_x, flow.float32)
+                x_fp16 = flow.cast(input_x, dtype=flow.float16)
+                constant_value_fp16 = flow.cast(constant_value, dtype=flow.float16)
+                y_fp16 = flow.constant_pad2d(x_fp16, padding, constant_value_fp16)
+                y_fp32 = flow.cast(y_fp16, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x_fp32, _compare_diff)
+            return y_fp32
+
+        return op_function
+
+
+def gen_numpy_test_sample(input_shape, padding, constant_value, is_float=True):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    pad_shape = ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right))
+
+    def _np_constant_pad2d(input, pad_shape, constant_value):
+        numpy_constant = np.pad(
+            input, pad_shape, "constant", constant_values=constant_value
+        )
+        return numpy_constant
+
+    def _np_constant_pad2d_grad(src, dest):
+        (dx_height, dx_width) = (input.shape[h_idx], input.shape[w_idx])
+        (dy_height, dy_width) = (output.shape[h_idx], output.shape[w_idx])
+        numpy_src = np.ones(src.shape, np.int32)
+        numpy_dest = np.zeros(dest.shape, np.int32)
+        array_src = FlattenArray(numpy_src)
+        array_dest = FlattenArray(numpy_dest)
+        src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+        dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+        elements_num = src.shape[0] * src_num
+        for iter_n in range(elements_num):
+            coords = Index2Coordinate(iter_n, src.shape)
+            (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+            ip_x = ip_y = 0
+            if (
+                j >= pad_left
+                and j < dx_width + pad_left
+                and (i >= pad_top)
+                and (i < dx_height + pad_top)
+            ):
+                ip_x = j - pad_left
+                ip_y = i - pad_top
+                src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+                dest_index = (
+                    n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+                )
+                array_dest[dest_index] += array_src[src_index]
+        numpy_dest = Array2Numpy(array_dest, dest.shape)
+        return numpy_dest
+
+    if is_float:
+        input = np.random.random(input_shape).astype(np.float32)
+    else:
+        input = np.random.randint(0, 100, input_shape)
+    output = _np_constant_pad2d(input, pad_shape, constant_value)
+    grad = _np_constant_pad2d_grad(output, input)
+    numpy_results = {
+        "input": input,
+        "padding": padding,
+        "constant_value": constant_value,
+        "output": output,
+        "grad": grad,
+    }
+    return numpy_results
+
+
+def _compare_op_function_with_samples(
+    test_case, device_type, sample, value_type, machine_ids, device_count
+):
+    op_function = _make_op_function(
+        test_case,
+        sample["input"].astype(value_type[0]),
+        sample["padding"],
+        sample["constant_value"],
+        sample["grad"].astype(value_type[0]),
+        device_type,
+        value_type[1],
+        machine_ids,
+        device_count,
+    )
+    y = (
+        op_function(sample["input"].astype(value_type[0]))
+        .get()
+        .numpy()
+        .astype(value_type[0])
+    )
+    if value_type == flow.float16:
+        test_case.assertTrue(
+            np.allclose(y, sample["output"].astype(np.float32), 0.001, 0.001)
+        )
+    else:
+        test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0])))
+
+
+def _gen_arg_dict(
+    device_type="gpu", value_type="float", machine_ids="0:0", device_count=1
+):
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = [device_type]
+    arg_dict["samples"] = []
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 1, 2, 2), [1, 1, 1, 1], 1.5))
+    arg_dict["samples"].append(gen_numpy_test_sample((4, 2, 3, 3), [2, 2, 2, 2], 0.0))
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 3, 4, 5), [3, 2, 1, 2], -2.0))
+    if value_type == "float":
+        if device_type == "gpu":
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+        else:
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+    elif value_type == "int":
+        arg_dict["value_type"] = [(np.float32, flow.int32)]
+    else:
+        raise "float or int for value type only"
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_count"] = [device_count]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConstantPad2d1n1d(flow.unittest.TestCase):
+    def test_op_function_int_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    def test_op_function_float_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestConstantPad2d1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_copy_comm_net_pass_empty.py b/python/oneflow/compatible/single_client/test/ops/test_copy_comm_net_pass_empty.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d3c6b2e0d8554ec8a35861f69f55629312c0f0c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_copy_comm_net_pass_empty.py
@@ -0,0 +1,145 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def ccrelu(x, name):
+    return (
+        flow.user_op_builder(name)
+        .Op("ccrelu")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestCopyCommNetPassEmpty(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_multi_node_comm_net(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        func_config.default_data_type(flow.float)
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function(function_config=func_config)
+        def ReluJob(x: oft.Numpy.Placeholder((10, 2))):
+            with flow.scope.placement("gpu", "0:0"):
+                out0 = ccrelu(x, "my_op_0_0")
+            with flow.scope.placement("gpu", "1:0"):
+                out1 = ccrelu(out0, "my_op_1_0")
+            with flow.scope.placement("gpu", "0:0"):
+                out2 = ccrelu(out1, "my_op_print")
+            return out2
+
+        index = [-2, -1, 0, 1, 2]
+        data = []
+        for i in index:
+            data.append(np.ones((10, 2), dtype=np.float32) * i)
+        for i in range(5):
+            ret = ReluJob(data[i]).get().numpy()
+            print(ret)
+            if index[i] > 0:
+                test_case.assertTrue(
+                    np.array_equal(ret, np.ones((10, 2), dtype=np.float32) * index[i])
+                )
+            else:
+                test_case.assertTrue(
+                    np.array_equal(ret, np.zeros((10, 2), dtype=np.float32))
+                )
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_multi_node_comm_net_dynamic(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        func_config.default_placement_scope(flow.scope.placement("gpu", "0:0"))
+        func_config.default_data_type(flow.float)
+        flow.config.machine_num(2)
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function(function_config=func_config)
+        def ReluJob(x: oft.ListNumpy.Placeholder((10, 2))):
+            with flow.scope.placement("gpu", "0:0"):
+                out0 = flow.math.relu(x)
+            with flow.scope.placement("gpu", "1:0"):
+                out1 = flow.math.relu(out0)
+            with flow.scope.placement("gpu", "0:0"):
+                out2 = flow.math.relu(out1)
+            return out2
+
+        index = [-2, -1, 0, 1, 2]
+        data = []
+        for i in index:
+            data.append(np.ones((5, 2), dtype=np.float32) * i)
+        for i in range(5):
+            ret = ReluJob([data[i]]).get().numpy_list()[0]
+            print(ret)
+            if index[i] > 0:
+                test_case.assertTrue(
+                    np.array_equal(ret, np.ones((5, 2), dtype=np.float32) * index[i])
+                )
+            else:
+                test_case.assertTrue(
+                    np.array_equal(ret, np.zeros((5, 2), dtype=np.float32))
+                )
+
+    def test_multi_node_comm_net_dynamic_empty(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        func_config.default_placement_scope(flow.scope.placement("cpu", "0:0"))
+        func_config.default_data_type(flow.float)
+        flow.config.machine_num(2)
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function(function_config=func_config)
+        def ReluJob(x: oft.ListNumpy.Placeholder((10, 2))):
+            with flow.scope.placement("cpu", "0:0"):
+                out0 = flow.math.relu(x)
+            with flow.scope.placement("cpu", "1:0"):
+                out1 = flow.math.relu(out0)
+            with flow.scope.placement("cpu", "0:0"):
+                out2 = flow.math.relu(out1)
+            return out2
+
+        index = [-2, -1, 0, 1, 2]
+        data = []
+        for i in index:
+            data.append(np.ones((0, 0), dtype=np.float32) * i)
+        for i in range(5):
+            ret = ReluJob([data[i]]).get().numpy_list()[0]
+            print(ret)
+            if index[i] > 0:
+                test_case.assertTrue(
+                    np.array_equal(ret, np.ones((0, 0), dtype=np.float32) * index[i])
+                )
+            else:
+                test_case.assertTrue(
+                    np.array_equal(ret, np.zeros((0, 0), dtype=np.float32))
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_count_not_finite.py b/python/oneflow/compatible/single_client/test/ops/test_count_not_finite.py
new file mode 100644
index 0000000000000000000000000000000000000000..f57956f9ca975768779a5e89d1113c22911e6c05
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_count_not_finite.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _run_count_test(test_case, device_type, x_shape, dtype):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def count_not_finite_job(
+        x: oft.Numpy.Placeholder(x_shape, dtype=type_name_to_flow_type[dtype])
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.count_not_finite(x)
+
+    x = np.random.randn(*x_shape).astype(type_name_to_np_type[dtype])
+    x[0] = np.nan
+    x[5][4] = np.inf
+    y = count_not_finite_job(x).get()
+    np_y = x.size - np.sum(np.isfinite(x))
+    assert y.numpy() == np_y
+
+
+def _run_multi_count_test(
+    test_case, device_type, x1_shape, x2_shape, dtype, x1_count, x2_count
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def multi_count_not_finite_job(
+        x1: oft.Numpy.Placeholder(x1_shape, dtype=type_name_to_flow_type[dtype]),
+        x2: oft.Numpy.Placeholder(x2_shape, dtype=type_name_to_flow_type[dtype]),
+    ):
+        x_list = []
+        for i in range(x1_count):
+            x_list.append(x1)
+        for i in range(x2_count):
+            x_list.append(x2)
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.multi_count_not_finite(x_list)
+
+    x1 = np.random.randn(*x1_shape).astype(type_name_to_np_type[dtype])
+    x1[0] = np.nan
+    x1[3] = np.inf
+    x2 = np.random.randn(*x2_shape).astype(type_name_to_np_type[dtype])
+    x2[2] = np.inf
+    x2[6, 5] = np.nan
+    y = multi_count_not_finite_job(x1, x2).get()
+    x1_not_finite = x1.size - np.sum(np.isfinite(x1))
+    x2_not_finite = x2.size - np.sum(np.isfinite(x2))
+    np_y = x1_not_finite * x1_count + x2_not_finite * x2_count
+    assert y.numpy() == np_y
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCountNotFinite(flow.unittest.TestCase):
+    def test_count_not_finite(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10, 30)]
+        arg_dict["dtype"] = ["float32", "double"]
+        for arg in GenArgList(arg_dict):
+            _run_count_test(*arg)
+
+    def test_multi_count_not_finite(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x1_shape"] = [(10, 20, 20)]
+        arg_dict["x2_shape"] = [(10, 20)]
+        arg_dict["dtype"] = ["float32", "double"]
+        arg_dict["x1_count"] = [10]
+        arg_dict["x2_count"] = [30]
+        for arg in GenArgList(arg_dict):
+            _run_multi_count_test(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_cpu_only_user_op.py b/python/oneflow/compatible/single_client/test/ops/test_cpu_only_user_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..866a91e388bc727f6f1aae53bb6afb6004684ea9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_cpu_only_user_op.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _cpu_only_relu(x):
+    op = (
+        flow.user_op_builder("CpuOnlyRelu")
+        .Op("cpu_only_relu_test")
+        .Input("in", [x])
+        .Output("out")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def _check_cpu_only_relu_device(test_case, verbose=False):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_placement_scope(flow.scope.placement("cpu", "0:0"))
+
+    @flow.global_function(function_config=func_config)
+    def cpu_only_relu_job(x_def: oft.Numpy.Placeholder(shape=(2, 5), dtype=flow.float)):
+        y = _cpu_only_relu(x_def)
+        if verbose:
+            print("cpu_only_relu output device", y.parallel_conf.device_tag())
+        test_case.assertTrue("cpu" in y.parallel_conf.device_tag())
+        return y
+
+    cpu_only_relu_job(np.random.rand(2, 5).astype(np.single)).get()
+
+
+def _check_non_cpu_only_relu_device(test_case):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_placement_scope(flow.scope.placement("gpu", "0:0"))
+
+    @flow.global_function(function_config=func_config)
+    def relu_job(x_def: oft.Numpy.Placeholder(shape=(2, 5), dtype=flow.float)):
+        with flow.scope.placement("gpu", "0:0"):
+            y = flow.math.relu(x_def)
+        test_case.assertTrue("gpu" in y.parallel_conf.device_tag())
+        return y
+
+    relu_job(np.random.rand(2, 5).astype(np.single)).get()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCpuOnlyUserOp(flow.unittest.TestCase):
+    def test_cpu_only_user_op(test_case):
+        _check_cpu_only_relu_device(test_case)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_non_cpu_only_user_op(test_case):
+        _check_non_cpu_only_relu_device(test_case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_ctc_greedy_decoder.py b/python/oneflow/compatible/single_client/test/ops/test_ctc_greedy_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..aab961af2a7093fe590270b58ea6ca3f52e6a819
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_ctc_greedy_decoder.py
@@ -0,0 +1,164 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Tuple
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+ninf = -float("inf")
+
+
+def _logsumexp(a, b):
+    if a < b:
+        (a, b) = (b, a)
+    if b == ninf:
+        return a
+    else:
+        return a + np.log(1 + np.exp(b - a))
+
+
+def logsumexp(*args):
+    res = args[0]
+    for e in args[1:]:
+        res = _logsumexp(res, e)
+    return res
+
+
+def log_softmax(logits, axis=0):
+    max_value = np.max(logits, axis, keepdims=True)
+    exp = np.exp(logits - max_value)
+    exp_sum = np.sum(exp, axis, keepdims=True)
+    dist = exp / exp_sum
+    return np.log(dist)
+
+
+def np_ctc_greedy_decoder(log_probs, input_lengths, merge_repeated=True):
+    blank_label = log_probs.shape[2] - 1
+    decodes = np.zeros(
+        (log_probs.shape[1], log_probs.shape[0]), dtype=input_lengths.dtype
+    )
+    neg_sum_logits = np.zeros((input_lengths.size, 1), dtype=log_probs.dtype)
+    for b in range(input_lengths.size):
+        input_length = input_lengths[b]
+        prev_indices = -1
+        t_dec = 0
+        for t in range(input_length):
+            max_indice = np.argmax(log_probs[t, b, :])
+            neg_sum_logits[b, 0] -= log_probs[t, b, max_indice]
+            if max_indice != blank_label and (
+                not (merge_repeated and max_indice == prev_indices)
+            ):
+                decodes[b, t_dec] = max_indice
+                t_dec += 1
+            prev_indices = max_indice
+    return (decodes, neg_sum_logits)
+
+
+def compare_with_np(
+    device_type,
+    device_num,
+    data_type,
+    max_input_length,
+    batch_size,
+    num_classes,
+    merge_repeated,
+):
+    assert data_type in ["float32", "double"]
+    assert device_type in ["gpu", "cpu"]
+    assert merge_repeated in [False, True]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        flow.config.gpu_device_num(device_num)
+    flow_data_type = type_name_to_flow_type[data_type]
+    np_data_type = type_name_to_np_type[data_type]
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow_data_type)
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    log_probs = np.random.random(
+        size=(max_input_length, batch_size, num_classes)
+    ).astype(np_data_type)
+    log_probs = log_softmax(log_probs, axis=2)
+    input_lengths = np.random.randint(
+        max_input_length / 2, high=max_input_length, size=(batch_size,), dtype=np.int64
+    )
+
+    @flow.global_function(function_config=func_config)
+    def ctc_greedy_decoder_job(
+        log_probs: tp.Numpy.Placeholder(
+            shape=(max_input_length, batch_size, num_classes), dtype=flow_data_type
+        ),
+        input_lengths: tp.Numpy.Placeholder(shape=(batch_size,), dtype=flow.int64),
+    ) -> Tuple[tp.Numpy, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            (decoded, neg_sum_logits) = flow.nn.ctc_greedy_decoder(
+                log_probs, input_lengths, merge_repeated
+            )
+        return (decoded, neg_sum_logits)
+
+    (of_decoded, of_neg_sum_logits) = ctc_greedy_decoder_job(log_probs, input_lengths)
+    (np_decoded, np_neg_sum_logits) = np_ctc_greedy_decoder(
+        log_probs, input_lengths, merge_repeated
+    )
+    np.allclose(of_decoded, np_decoded, atol=1e-05)
+    np.allclose(of_neg_sum_logits, np_neg_sum_logits, atol=1e-05)
+
+
+def gen_arg_list(type):
+    arg_dict = OrderedDict()
+    if type == "1n2d":
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["device_num"] = [2]
+    else:
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["device_num"] = [1]
+    arg_dict["data_type"] = ["float32"]
+    arg_dict["max_input_length"] = [20]
+    arg_dict["batch_size"] = [4]
+    arg_dict["num_classes"] = [5]
+    arg_dict["merge_repeated"] = [False, True]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCTCGreedyDecoder1n1d(flow.unittest.TestCase):
+    def test_ctc_greedy_decoder(test_case):
+        for arg in gen_arg_list("1n1d"):
+            compare_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestCTCGreedyDecoder1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_ctc_greedy_decoder(test_case):
+        for arg in gen_arg_list("1n2d"):
+            compare_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_ctc_loss.py b/python/oneflow/compatible/single_client/test/ops/test_ctc_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e2ca3979ea51c8d9e20df3c3d9c4d89b1961b2f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_ctc_loss.py
@@ -0,0 +1,333 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+ninf = -float("inf")
+
+
+def _logsumexp(a, b):
+    if a < b:
+        (a, b) = (b, a)
+    if b == ninf:
+        return a
+    else:
+        return a + np.log(1 + np.exp(b - a))
+
+
+def logsumexp(*args):
+    res = args[0]
+    for e in args[1:]:
+        res = _logsumexp(res, e)
+    return res
+
+
+def log_softmax(logits, axis=0):
+    max_value = np.max(logits, axis, keepdims=True)
+    exp = np.exp(logits - max_value)
+    exp_sum = np.sum(exp, axis, keepdims=True)
+    dist = exp / exp_sum
+    return np.log(dist)
+
+
+def get_target_prime(targets, b, s, blank):
+    if s % 2 == 0:
+        return blank
+    else:
+        return targets[b, s // 2]
+
+
+def ctc_loss_np(log_probs, targets, input_lengths, target_lengths, blank=0):
+    (max_input_length, batch_size, _) = log_probs.shape
+    (_, max_target_length) = targets.shape
+    loss = np.zeros(batch_size)
+    alpha = np.zeros([batch_size, max_input_length, 2 * max_target_length + 1])
+    alpha[:, 0] = ninf
+    for b in range(0, batch_size):
+        input_length = input_lengths[b]
+        target_length = target_lengths[b]
+        alpha[b, 0, 0] = log_probs[0, b, blank]
+        if target_length > 0:
+            current_target_prime = get_target_prime(targets, b, 1, blank)
+            alpha[b, 0, 1] = log_probs[0, b, current_target_prime]
+        for t in range(1, input_length):
+            for s in range(0, 2 * target_length + 1):
+                current_target_prime = get_target_prime(targets, b, s, blank)
+                la1 = alpha[b, t - 1, s]
+                if s > 0:
+                    la2 = alpha[b, t - 1, s - 1]
+                else:
+                    la2 = ninf
+                if (
+                    s > 1
+                    and get_target_prime(targets, b, s - 2, blank)
+                    != current_target_prime
+                ):
+                    la3 = alpha[b, t - 1, s - 2]
+                else:
+                    la3 = ninf
+                alpha[b, t, s] = (
+                    logsumexp(la1, la2, la3) + log_probs[t, b, current_target_prime]
+                )
+        if target_length == 0:
+            loss[b] = -alpha[b, input_length - 1, 0]
+        else:
+            l1 = alpha[b, input_length - 1, target_length * 2]
+            l2 = alpha[b, input_length - 1, target_length * 2 - 1]
+            loss[b] = -logsumexp(l1, l2)
+    return (loss, alpha)
+
+
+def ctc_loss_grad_np(
+    grad_out,
+    loss,
+    alpha,
+    log_probs,
+    targets,
+    input_lengths,
+    target_lengths,
+    blank=0,
+    zero_infinity=False,
+):
+    (max_input_length, batch_size, num_labels) = log_probs.shape
+    (_, max_target_length) = targets.shape
+    beta = np.zeros([batch_size, max_input_length, 2 * max_target_length + 1])
+    grad = np.zeros(log_probs.shape, dtype=log_probs.dtype)
+    grad.fill(ninf)
+    for b in range(0, batch_size):
+        input_length = input_lengths[b]
+        target_length = target_lengths[b]
+        nll = loss[b]
+        if zero_infinity and nll == float("inf"):
+            grad[:, b, :] = 0
+            continue
+        if input_length > 0:
+            beta[b, input_length - 1, :] = ninf
+            beta[b, input_length - 1, 2 * target_length] = log_probs[
+                input_length - 1, b, blank
+            ]
+            grad[input_length - 1, b, blank] = (
+                alpha[b, input_length - 1, 2 * target_length]
+                + beta[b, input_length - 1, 2 * target_length]
+            )
+            if target_length > 0:
+                current_target_prime = get_target_prime(
+                    targets, b, 2 * target_length - 1, blank
+                )
+                beta[b, input_length - 1, 2 * target_length - 1] = log_probs[
+                    input_length - 1, b, current_target_prime
+                ]
+                grad[input_length - 1, b, current_target_prime] = (
+                    alpha[b, input_length - 1, 2 * target_length - 1]
+                    + beta[b, input_length - 1, 2 * target_length - 1]
+                )
+        for t in range(input_length - 2, -1, -1):
+            for s in range(2 * target_length, -1, -1):
+                current_target_prime = get_target_prime(targets, b, s, blank)
+                lb1 = beta[b, t + 1, s]
+                if s < 2 * target_length:
+                    lb2 = beta[b, t + 1, s + 1]
+                else:
+                    lb2 = ninf
+                if (
+                    s < 2 * target_length - 1
+                    and get_target_prime(targets, b, s + 2, blank)
+                    != current_target_prime
+                ):
+                    lb3 = beta[b, t + 1, s + 2]
+                else:
+                    lb3 = ninf
+                beta[b, t, s] = (
+                    logsumexp(lb1, lb2, lb3) + log_probs[t, b, current_target_prime]
+                )
+                alpha_beta = alpha[b, t, s] + beta[b, t, s]
+                lcab = grad[t, b, current_target_prime]
+                if lcab == ninf:
+                    grad[t, b, current_target_prime] = alpha_beta
+                else:
+                    grad[t, b, current_target_prime] = logsumexp(lcab, alpha_beta)
+        for t in range(0, input_length):
+            for c in range(0, num_labels):
+                res = grad[t, b, c]
+                lp = log_probs[t, b, c]
+                grad[t, b, c] = (np.exp(lp) - np.exp(res + nll - lp)) * grad_out[b]
+        if input_length < max_input_length:
+            grad[input_length:max_input_length, b] = 0
+    return grad
+
+
+def compare_with_np(
+    device_type,
+    device_num,
+    data_type,
+    max_input_length,
+    batch_size,
+    num_classes,
+    max_target_length,
+    blank,
+    reduction,
+    zero_infinity,
+):
+    assert data_type in ["float32", "double"]
+    assert device_type in ["gpu", "cpu"]
+    assert reduction in ["none", "mean", "sum"]
+    assert zero_infinity in [False, True]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        flow.config.gpu_device_num(device_num)
+    flow_data_type = type_name_to_flow_type[data_type]
+    np_data_type = type_name_to_np_type[data_type]
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow_data_type)
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    log_probs = np.random.random(
+        size=(max_input_length, batch_size, num_classes)
+    ).astype(np_data_type)
+    log_probs = log_softmax(log_probs, axis=2)
+    targets = np.random.randint(
+        1, high=num_classes, size=(batch_size, max_target_length), dtype=np.int32
+    )
+    input_lengths = np.random.randint(
+        max_input_length / 2, high=max_input_length, size=(batch_size,), dtype=np.int32
+    )
+    target_lengths = np.random.randint(
+        max_target_length / 2,
+        high=max_target_length,
+        size=(batch_size,),
+        dtype=np.int32,
+    )
+    (np_loss, np_alpha) = ctc_loss_np(
+        log_probs, targets, input_lengths, target_lengths, blank
+    )
+    np_out = np.where(np_loss == float("inf"), 0, np_loss) if zero_infinity else np_loss
+    if reduction == "mean":
+        np_out = np.mean(
+            np.divide(
+                np_out, np.clip(target_lengths, 1, a_max=None).astype(np_data_type)
+            )
+        )
+    elif reduction == "sum":
+        np_out = np.sum(np_out)
+    np_grad_out = np.ones_like(np_loss, dtype=np_data_type)
+    if reduction == "mean":
+        np_grad_out = np.divide(
+            np_grad_out, np.clip(target_lengths, 1, a_max=None).astype(np_data_type)
+        )
+        np_grad_out /= target_lengths.size
+    np_grad = ctc_loss_grad_np(
+        np_grad_out,
+        np_loss,
+        np_alpha,
+        log_probs,
+        targets,
+        input_lengths,
+        target_lengths,
+        blank,
+        zero_infinity,
+    )
+
+    def assert_loss_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad, atol=1e-05, equal_nan=True)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ctc_loss_job(
+        log_probs: tp.Numpy.Placeholder(
+            shape=(max_input_length, batch_size, num_classes), dtype=flow_data_type
+        ),
+        targets: tp.Numpy.Placeholder(
+            shape=(batch_size, max_target_length), dtype=flow.int32
+        ),
+        input_lengths: tp.Numpy.Placeholder(shape=(batch_size,), dtype=flow.int32),
+        target_lengths: tp.Numpy.Placeholder(shape=(batch_size,), dtype=flow.int32),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=log_probs.shape,
+                dtype=flow_data_type,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = log_probs + v
+        flow.watch_diff(x_var, assert_loss_grad)
+        loss = flow.ctc_loss(
+            x_var,
+            targets,
+            input_lengths,
+            target_lengths,
+            blank,
+            reduction,
+            zero_infinity,
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(loss)
+        return loss
+
+    of_out = ctc_loss_job(log_probs, targets, input_lengths, target_lengths)
+    assert np.allclose(of_out, np_out, atol=1e-05)
+
+
+def gen_arg_list(type):
+    arg_dict = OrderedDict()
+    if type == "1n2d":
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["device_num"] = [2]
+    else:
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["device_num"] = [1]
+    arg_dict["data_type"] = ["float32"]
+    arg_dict["max_input_length"] = [20]
+    arg_dict["batch_size"] = [4]
+    arg_dict["num_classes"] = [5]
+    arg_dict["max_target_length"] = [10]
+    arg_dict["blank"] = [0, 4]
+    arg_dict["reduction"] = ["mean", "none"]
+    arg_dict["zero_infinity"] = [False, True]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCTCLoss1n1d(flow.unittest.TestCase):
+    def test_ctc_loss(test_case):
+        for arg in gen_arg_list("1n1d"):
+            compare_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestCTCLoss1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_ctc_loss(test_case):
+        for arg in gen_arg_list("1n2d"):
+            compare_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_deconv2d.py b/python/oneflow/compatible/single_client/test/ops/test_deconv2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..38c16c170fab78f7d582051ccdd53fa3ce7797b7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_deconv2d.py
@@ -0,0 +1,188 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, params_case, dilations, data_format):
+    (input_shape, output_shape, padding, strides, kernel_size) = params_case
+    assert data_format in ["NCHW", "NHWC"]
+    out_channels = output_shape[1] if data_format == "NCHW" else output_shape[3]
+    in_channels = input_shape[1] if data_format == "NCHW" else input_shape[3]
+    assert device_type in ["gpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def DeconvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            if data_format == "NCHW":
+                weight = flow.get_variable(
+                    "weight",
+                    shape=(in_channels, out_channels, kernel_size, kernel_size),
+                    dtype=flow.float,
+                    initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                    trainable=True,
+                )
+            else:
+                weight = flow.get_variable(
+                    "weight",
+                    shape=(in_channels, kernel_size, kernel_size, out_channels),
+                    dtype=flow.float,
+                    initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                    trainable=True,
+                )
+            loss = flow.nn.conv2d_transpose(
+                x,
+                weight,
+                strides=strides,
+                output_shape=output_shape,
+                dilations=dilations,
+                padding=padding,
+                data_format=data_format,
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = DeconvJob().get()
+    if data_format == "NCHW":
+        with tf.GradientTape(persistent=True) as tape:
+            x = tf.Variable(test_global_storage.Get("x").transpose(0, 2, 3, 1))
+            output_shape = (
+                output_shape[0],
+                output_shape[2],
+                output_shape[3],
+                output_shape[1],
+            )
+            w = tf.Variable(test_global_storage.Get("weight").transpose(2, 3, 1, 0))
+            tf_out = tf.nn.conv2d_transpose(
+                x,
+                w,
+                output_shape=output_shape,
+                strides=[1, strides, strides, 1],
+                padding=padding,
+                data_format="NHWC",
+            )
+        loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 3, 1)
+        tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+        tf_weight_diff = tape.gradient(tf_out, w, loss_diff)
+        assert np.allclose(
+            of_out.numpy().transpose(0, 2, 3, 1), tf_out.numpy(), rtol=0.01, atol=0.01
+        )
+        assert np.allclose(
+            test_global_storage.Get("x_diff").transpose(0, 2, 3, 1),
+            tf_x_diff.numpy(),
+            rtol=0.0001,
+            atol=0.0001,
+        )
+        assert np.allclose(
+            test_global_storage.Get("weight_diff").transpose(2, 3, 1, 0),
+            tf_weight_diff.numpy(),
+            rtol=0.0001,
+            atol=0.0001,
+        )
+    else:
+        with tf.GradientTape(persistent=True) as tape:
+            x = tf.Variable(test_global_storage.Get("x"))
+            w = tf.Variable(test_global_storage.Get("weight").transpose(1, 2, 3, 0))
+            tf_out = tf.nn.conv2d_transpose(
+                x,
+                w,
+                output_shape=output_shape,
+                strides=[1, strides, strides, 1],
+                padding=padding,
+                data_format="NHWC",
+            )
+        loss_diff = test_global_storage.Get("loss_diff")
+        tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+        tf_weight_diff = tape.gradient(tf_out, w, loss_diff)
+        assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=0.01, atol=0.01), (
+            of_out.numpy() - tf_out.numpy()
+        )
+        assert np.allclose(
+            test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=0.01, atol=0.01
+        )
+        assert np.allclose(
+            test_global_storage.Get("weight_diff").transpose(1, 2, 3, 0),
+            tf_weight_diff.numpy(),
+            rtol=0.01,
+            atol=0.01,
+        )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDeconv2d(flow.unittest.TestCase):
+    def test_deconv2d_NHWC_1n1c(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_case"] = [
+            ((32, 3, 3, 4), (32, 3, 3, 8), "SAME", 1, 3),
+            ((32, 3, 3, 2), (32, 6, 6, 8), "SAME", 2, 4),
+            ((32, 2, 2, 1), (32, 5, 5, 2), "VALID", 2, 2),
+            ((32, 2, 2, 16), (32, 8, 8, 4), "VALID", 2, 5),
+        ]
+        arg_dict["dilations"] = [1]
+        arg_dict["data_format"] = ["NHWC"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_deconv2d_NCHW_1n1c(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_case"] = [
+            ((32, 4, 3, 3), (32, 8, 3, 3), "SAME", 1, 3),
+            ((32, 4, 3, 3), (32, 8, 6, 6), "SAME", 2, 5),
+            ((32, 1, 2, 2), (32, 2, 5, 5), "VALID", 2, 2),
+            ((32, 16, 2, 2), (32, 4, 8, 8), "VALID", 2, 5),
+        ]
+        arg_dict["dilations"] = [1]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_demo.py b/python/oneflow/compatible/single_client/test/ops/test_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..371bd37199fd7dfeb83f5626d86cdc99f053bd49
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_demo.py
@@ -0,0 +1,39 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@unittest.skipIf(flow.unittest.env.device_num() != 1, "only runs when device_num is 1")
+class TestDemo(flow.unittest.TestCase):
+    @unittest.skipIf(
+        flow.unittest.env.node_size() != 1, "only runs when node_size is 1"
+    )
+    def test_foo(test_case):
+        pass
+
+    @unittest.skipIf(
+        flow.unittest.env.node_size() != 2, "only runs when node_size is 2"
+    )
+    def test_bar(test_case):
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_demo_matmul.py b/python/oneflow/compatible/single_client/test/ops/test_demo_matmul.py
new file mode 100644
index 0000000000000000000000000000000000000000..b73c5f0c973b289b02f09db64bc1c1faa06f324f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_demo_matmul.py
@@ -0,0 +1,58 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestDemoMatmul(flow.unittest.TestCase):
+    def test_watch(test_case):
+        flow.config.gpu_device_num(2)
+        flow.config.enable_debug_mode(True)
+        expected = np.array(
+            [[30, 30, 30, 30], [30, 30, 30, 30], [30, 30, 30, 30], [30, 30, 30, 30]]
+        ).astype(np.float32)
+
+        def Watch(x: tp.Numpy):
+            test_case.assertTrue(np.allclose(x, expected))
+
+        @flow.global_function()
+        def Matmul(
+            x: tp.Numpy.Placeholder((4, 4), dtype=flow.float32),
+            y: tp.Numpy.Placeholder((4, 4), dtype=flow.float32),
+        ) -> tp.Numpy:
+            s = flow.matmul(x, y)
+            flow.watch(s, Watch)
+            z = flow.matmul(s, x)
+            return z
+
+        x = np.array([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]).astype(
+            np.float32
+        )
+        y = np.array([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]]).astype(
+            np.float32
+        )
+        Matmul(x, y)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_diag.py b/python/oneflow/compatible/single_client/test/ops/test_diag.py
new file mode 100644
index 0000000000000000000000000000000000000000..fac2edca63258d83f8e0afdcec497687fadaaa01
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_diag.py
@@ -0,0 +1,136 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def diag_grad_np(input, diagonal, output, grad):
+    input_shape = input.shape
+    output_shape = output.shape
+    grad_output = np.zeros(input_shape)
+    if len(input_shape) == 1:
+        stride0 = output_shape[1]
+        beg = diagonal if diagonal >= 0 else stride0 * abs(diagonal)
+        for i in range(input_shape[0]):
+            if i > 0:
+                beg += stride0 + 1
+            if diagonal >= 0:
+                grad_output[i] = grad[i][beg % stride0]
+            if diagonal < 0:
+                grad_output[i] = grad[(beg - i) // stride0][i]
+        return grad_output
+    else:
+        stride01 = input_shape[1]
+        beg = diagonal if diagonal >= 0 else stride01 * abs(diagonal)
+        for i in range(output.shape[0]):
+            if i > 0:
+                beg += stride01 + 1
+            if diagonal >= 0:
+                grad_output[i][beg % stride01] = grad[i]
+            if diagonal < 0:
+                stride02 = input_shape[0]
+                grad_output[(beg - i) // stride02][i] = grad[i]
+        return grad_output
+
+
+def _compare_diag_with_np(device_type, device_num, data_type, input_shape, diagonal):
+    assert device_type in ["gpu", "cpu"]
+    np_data_type = type_name_to_np_type[data_type]
+    flow_data_type = type_name_to_flow_type[data_type]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        flow.config.gpu_device_num(device_num)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow_data_type)
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    func_config.default_logical_view(flow.scope.consistent_view())
+    input_1 = (np.random.random(input_shape) * 100).astype(np_data_type)
+    np_out = np.diag(input_1, diagonal)
+    _grad = np.ones_like(np_out)
+    np_grad = diag_grad_np(input_1, diagonal, np_out, _grad)
+
+    def assert_diag_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def diag_job(
+        input: tp.Numpy.Placeholder(shape=input_shape, dtype=flow.float)
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            input_var = flow.get_variable(
+                "input",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+                trainable=True,
+            )
+            input = input + input_var
+        flow.watch_diff(input, assert_diag_grad)
+        output = flow.diag(input, diagonal)
+        if output.dtype in (flow.int8, flow.int32, flow.int64):
+            output = flow.cast(output, flow.float)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.Adam(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001])
+            ).minimize(output)
+        return output
+
+    of_out = diag_job(input_1.astype(np.float32))
+    assert np.allclose(of_out, np_out)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDiag1n1d(flow.unittest.TestCase):
+    def test_diag_1n1d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["device_num"] = [1]
+        arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+        arg_dict["input_shape"] = [(3,), (3, 3), (3, 4)]
+        arg_dict["diagonal"] = [0, 2, -1]
+        for arg in GenArgList(arg_dict):
+            _compare_diag_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestDiag1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_diag_gpu_1n2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["device_num"] = [2]
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["input_shape"] = [(3, 3)]
+        arg_dict["diagonal"] = [0]
+        for arg in GenArgList(arg_dict):
+            _compare_diag_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_dim_gather.py b/python/oneflow/compatible/single_client/test/ops/test_dim_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..94d9569f536634ad2df91c3f8b1d69e3f68bf569
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_dim_gather.py
@@ -0,0 +1,256 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def gen_gather_test_sample(input_shape, index_shape, dim, is_float=True):
+    def _np_dim_scatter_add(src, dim, index, outshape):
+        output = np.zeros(outshape)
+        for srcidx in range(0, src.size):
+            outcoord = np.unravel_index(srcidx, src.shape)
+            outcoord = [*outcoord]
+            outcoord[dim] = index[np.unravel_index(srcidx, index.shape)]
+            output_offset = np.ravel_multi_index(outcoord, outshape)
+            output[np.unravel_index(output_offset, outshape)] += src[
+                np.unravel_index(srcidx, src.shape)
+            ]
+        return output
+
+    if is_float:
+        input = np.random.random(input_shape)
+    else:
+        input = np.random.randint(0, 100, input_shape)
+    index = np.random.randint(0, input_shape[dim], index_shape)
+    output = np.take_along_axis(input, index, dim)
+    grad = _np_dim_scatter_add(np.ones_like(output), dim, index, input_shape)
+    ret = {"input": input, "index": index, "dim": dim, "output": output, "grad": grad}
+    return ret
+
+
+def _make_dim_gather_fn(
+    test_case,
+    input,
+    index,
+    dim,
+    grad,
+    device_type,
+    value_type,
+    index_type,
+    machine_ids,
+    device_counts,
+):
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    if value_type == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type)
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def _compare_diff(blob: oft.Numpy):
+        test_case.assertTrue(np.allclose(grad, blob))
+
+    if value_type == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def gather_fn(
+            params_def: oft.Numpy.Placeholder(input.shape, dtype=flow.float32),
+            indices_def: oft.Numpy.Placeholder(index.shape, dtype=index_type),
+        ) -> oft.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    "input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                x = x_var + params_def
+                x_f16 = flow.cast(x, flow.float16)
+            y_f16 = flow.dim_gather(x_f16, dim, indices_def)
+            x_f32 = flow.cast(x, flow.float32)
+            y_f32 = flow.cast(y_f16, flow.float32)
+            y = flow.dim_gather(x, dim, indices_def)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(y_f32)
+            flow.watch_diff(x_f32, _compare_diff)
+            return y_f32
+
+        return gather_fn
+    elif value_type == flow.float32 or value_type == flow.float64:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def gather_fn(
+            params_def: oft.Numpy.Placeholder(input.shape, dtype=value_type),
+            indices_def: oft.Numpy.Placeholder(index.shape, dtype=index_type),
+        ) -> oft.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    "input",
+                    shape=input.shape,
+                    dtype=value_type,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                x = x_var + params_def
+            y = flow.dim_gather(x, dim, indices_def)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(y)
+            flow.watch_diff(x, _compare_diff)
+            return y
+
+        return gather_fn
+    elif value_type == flow.int32:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def gather_fn(
+            params_def: oft.Numpy.Placeholder(input.shape, dtype=flow.float32),
+            indices_def: oft.Numpy.Placeholder(index.shape, dtype=index_type),
+        ) -> oft.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    "input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                x = x_var + params_def
+            x_int32 = flow.cast(x, dtype=flow.int32)
+            y_int32 = flow.dim_gather(x, dim, indices_def)
+            y_fp32 = flow.cast(y_int32, dtype=flow.float32)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x, _compare_diff)
+            return y_fp32
+
+        return gather_fn
+
+
+def _compare_dim_gather_with_samples(
+    test_case, device_type, sample, value_type, index_type, machine_ids, device_count
+):
+    gather_fn = _make_dim_gather_fn(
+        test_case,
+        sample["input"].astype(value_type[0]),
+        sample["index"].astype(index_type[0]),
+        sample["dim"],
+        sample["grad"].astype(value_type[0]),
+        device_type,
+        value_type[1],
+        index_type[1],
+        machine_ids,
+        device_count,
+    )
+    y = gather_fn(
+        sample["input"].astype(value_type[0]), sample["index"].astype(index_type[0])
+    )
+    y.astype(value_type[0])
+    if value_type == flow.float16:
+        test_case.assertTrue(
+            np.allclose(y, sample["output"].astype(np.float32), 0.001, 0.001)
+        )
+    else:
+        test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0])))
+
+
+def _gen_arg_dict(
+    device_type="gpu", value_type="float", machine_ids="0:0", device_count=1
+):
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = [device_type]
+    arg_dict["samples"] = []
+    arg_dict["samples"].append(gen_gather_test_sample((2, 2), (2, 2), 1))
+    arg_dict["samples"].append(gen_gather_test_sample((2, 2), (2, 2), 0))
+    arg_dict["samples"].append(gen_gather_test_sample((8, 3, 2), (4, 3, 2), 0))
+    if value_type == "float":
+        arg_dict["value_type"] = [
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    elif value_type == "int":
+        arg_dict["value_type"] = [(np.float32, flow.int32)]
+    else:
+        raise "float or int for value type only"
+    arg_dict["index_type"] = [(np.int32, flow.int32), (np.int64, flow.int64)]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_count"] = [device_count]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDimGather1n1d(flow.unittest.TestCase):
+    def test_dim_gather_float_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_dim_gather_with_samples(test_case, *arg)
+
+    def test_dim_gather_int_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_dim_gather_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_dim_gather_float_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_dim_gather_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_dim_gather_int_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_dim_gather_with_samples(test_case, *arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestDimGather1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_dim_gather_float(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_dim_gather_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_dim_gather_int(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_dim_gather_with_samples(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_dim_gather_dynamic.py b/python/oneflow/compatible/single_client/test/ops/test_dim_gather_dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce1f543ec62c386ebf8196edb38dd9610c5b5e16
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_dim_gather_dynamic.py
@@ -0,0 +1,119 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def gen_gather_test_sample(input_shape, index_shape, dim, is_float=True):
+    def _np_dim_scatter_add(src, dim, index, outshape):
+        output = np.zeros(outshape)
+        for srcidx in range(0, src.size):
+            outcoord = np.unravel_index(srcidx, src.shape)
+            outcoord = [*outcoord]
+            outcoord[dim] = index[np.unravel_index(srcidx, index.shape)]
+            output_offset = np.ravel_multi_index(outcoord, outshape)
+            output[np.unravel_index(output_offset, outshape)] += src[
+                np.unravel_index(srcidx, src.shape)
+            ]
+        return output
+
+    if is_float:
+        input = np.random.random(input_shape)
+    else:
+        input = np.random.randint(0, 100, input_shape)
+    index = np.random.randint(0, input_shape[dim], index_shape)
+    output = np.take_along_axis(input, index, dim)
+    grad = _np_dim_scatter_add(np.ones_like(output), dim, index, input_shape)
+    ret = {
+        "input": input.astype(np.float32),
+        "index": index.astype(np.int32),
+        "dim": dim,
+        "output": output.astype(np.float32),
+        "grad": grad.astype(np.float32),
+    }
+    return ret
+
+
+def _make_dim_gather_fn(test_case, sample, datashape):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_placement_scope(flow.scope.placement("gpu", "0:0"))
+
+    def _compare_diff(blob: oft.ListNumpy):
+        test_case.assertTrue(np.allclose(sample["grad"], blob[0]))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def DynamicDimGatherJob(
+        params_def: oft.ListNumpy.Placeholder(datashape, dtype=flow.float32),
+        index_def: oft.ListNumpy.Placeholder(datashape, dtype=flow.int32),
+    ) -> oft.ListNumpy:
+        x_var = flow.get_variable(
+            "input",
+            shape=(1,),
+            dtype=flow.float32,
+            initializer=flow.constant_initializer(0),
+        )
+        x_var = flow.cast_to_current_logical_view(x_var)
+        x = x_var + params_def
+        y = flow.dim_gather(x, sample["dim"], index_def)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(y)
+        flow.watch_diff(x, _compare_diff)
+        return y
+
+    return DynamicDimGatherJob
+
+
+def _compare_dim_gather_with_samples(test_case, inputshape, indexshape, dim, maxshape):
+    sample = gen_gather_test_sample(inputshape, indexshape, dim)
+    dynamic_dim_gather = _make_dim_gather_fn(test_case, sample, maxshape)
+    out = dynamic_dim_gather([sample["input"]], [sample["index"]])[0]
+    test_case.assertTrue(
+        np.allclose(out, sample["output"].astype(np.float32), 0.001, 0.001)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDynamicDimGather(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_dynamic_dim_gather(test_case):
+        _compare_dim_gather_with_samples(
+            test_case, inputshape=(2, 2), indexshape=(2, 2), dim=1, maxshape=(10, 10)
+        )
+        _compare_dim_gather_with_samples(
+            test_case, inputshape=(2, 2), indexshape=(2, 2), dim=0, maxshape=(10, 10)
+        )
+        _compare_dim_gather_with_samples(
+            test_case,
+            inputshape=(4, 4, 3),
+            indexshape=(4, 1, 3),
+            dim=1,
+            maxshape=(10, 10, 10),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_distribute_concat.py b/python/oneflow/compatible/single_client/test/ops/test_distribute_concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..82faeda126d90dee6c19f69130ffb8d5080b2a89
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_distribute_concat.py
@@ -0,0 +1,48 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestDistributeConcat(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_deadlock(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.enable_inplace(False)
+
+        @flow.global_function(function_config=func_config)
+        def DistributeConcat():
+            with flow.scope.placement("gpu", "0:0"):
+                w = flow.get_variable(
+                    "w", (2, 5), initializer=flow.constant_initializer(10)
+                )
+                x = w + 1
+                y = w + 1
+            ret = flow.advanced.distribute_concat([x, y])
+
+        DistributeConcat()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_dropout.py b/python/oneflow/compatible/single_client/test/ops/test_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3a33c03e1cfc8469284c56d56fad849ed2eb419
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_dropout.py
@@ -0,0 +1,229 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import shutil
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def of_run(device_type, x_shape, data_type, rate, seed):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    if data_type == "float16":
+        dtype = flow.float
+    else:
+        dtype = type_name_to_flow_type[data_type]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def DropoutJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=dtype,
+                initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
+                trainable=True,
+            )
+            if data_type == "float16":
+                x = flow.cast(flow.cast(x, flow.float16), dtype)
+                of_out = flow.cast(
+                    flow.nn.dropout(
+                        flow.cast(x, flow.float16), rate=rate, seed=seed, name="dropout"
+                    ),
+                    dtype,
+                )
+            else:
+                of_out = flow.nn.dropout(x, rate=rate, seed=seed, name="dropout")
+            loss = flow.math.square(of_out)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(of_out, test_global_storage.Setter("out"))
+            flow.watch_diff(of_out, test_global_storage.Setter("out_diff"))
+            return loss
+
+    of_out = DropoutJob().get()
+    of_out = test_global_storage.Get("out")
+    out_diff = test_global_storage.Get("out_diff")
+    assert np.allclose(
+        [1 - np.count_nonzero(of_out) / of_out.size], [rate], atol=rate / 5
+    )
+    x = test_global_storage.Get("x")
+    x_diff = test_global_storage.Get("x_diff")
+    out_scale = of_out[np.where(of_out != 0)] / x[np.where(of_out != 0)]
+    diff_scale = x_diff[np.where(of_out != 0)] / out_diff[np.where(of_out != 0)]
+    assert np.allclose(out_scale, 1.0 / (1.0 - rate), atol=1e-05)
+    assert np.allclose(diff_scale, 1.0 / (1.0 - rate), atol=1e-05)
+
+
+def of_run_module(device_type, x_shape, data_type, rate, seed):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    dtype = type_name_to_flow_type[data_type]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def DropoutJob() -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=dtype,
+                initializer=flow.ones_initializer(),
+                trainable=True,
+            )
+            of_out = flow.nn.dropout(x, rate=rate, seed=seed, name="dropout")
+            loss = flow.math.square(of_out)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            return of_out
+
+    of_out = DropoutJob()
+    of_out2 = DropoutJob()
+    return (of_out, of_out2)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDropout(flow.unittest.TestCase):
+    def test_dropout(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(100, 100, 10, 20)]
+        arg_dict["data_type"] = ["float32", "double", "float16"]
+        arg_dict["rate"] = [0.75]
+        arg_dict["seed"] = [12345, None]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[2] == "float16":
+                continue
+            of_run(*arg)
+
+    def test_dropout_module(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(2, 2, 2, 2)]
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["rate"] = [0.75]
+        arg_dict["seed"] = [12345]
+        literals = {
+            "cpu": [
+                np.array(
+                    [
+                        4.0,
+                        4.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        4.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        4.0,
+                        4.0,
+                        0.0,
+                        0.0,
+                        4.0,
+                    ]
+                ),
+                np.array(
+                    [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        4.0,
+                        4.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        4.0,
+                        0.0,
+                        0.0,
+                    ]
+                ),
+            ],
+            "gpu": [
+                np.array(
+                    [
+                        4.0,
+                        4.0,
+                        0.0,
+                        4.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        4.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                    ]
+                ),
+                np.array(
+                    [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        4.0,
+                        4.0,
+                        0.0,
+                    ]
+                ),
+            ],
+        }
+        for arg in GenArgList(arg_dict):
+            (of_out_a, of_out_b) = of_run_module(*arg)
+            test_case.assertEqual(
+                (np.abs(literals[arg[0]][0] - of_out_a.flatten()) < 1e-06).all(), True
+            )
+            test_case.assertEqual(
+                (np.abs(literals[arg[0]][1] - of_out_b.flatten()) < 1e-06).all(), True
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_dynamic_loss_scale_schedule.py b/python/oneflow/compatible/single_client/test/ops/test_dynamic_loss_scale_schedule.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ec1dd8e854a6fccee524f9a37988a6e89db493c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_dynamic_loss_scale_schedule.py
@@ -0,0 +1,144 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def dynamic_loss_scale_schedule(
+    count_not_finite, loss_scale, good_step_counter, increment_period, multiplier, name
+):
+    flow.user_op_builder(name).Op("dynamic_loss_scale_schedule").Input(
+        "count_not_finite", [count_not_finite]
+    ).Input("loss_scale", [loss_scale]).Input(
+        "good_step_counter", [good_step_counter]
+    ).Attr(
+        "increment_period", increment_period
+    ).Attr(
+        "multiplier", multiplier
+    ).Build().InferAndTryRun()
+
+
+def _run_test(test_case, device_type, op_param):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def schedule_job(count_not_finite: oft.Numpy.Placeholder((1,), dtype=flow.int64)):
+        with flow.scope.placement(device_type, "0:0"):
+            good_step_counter = flow.get_variable(
+                name="good_step_counter",
+                shape=(1,),
+                dtype=flow.int64,
+                initializer=flow.constant_initializer(
+                    op_param["good_step_counter_value"], dtype=flow.int64
+                ),
+            )
+            loss_scale = flow.get_variable(
+                name="loss_scale",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.constant_initializer(
+                    op_param["loss_scale_value"], dtype=flow.float
+                ),
+            )
+            dynamic_loss_scale_schedule(
+                count_not_finite,
+                loss_scale,
+                good_step_counter,
+                op_param["increment_period"],
+                op_param["multiplier"],
+                "dynamic_schedule",
+            )
+            return (good_step_counter, loss_scale)
+
+    @flow.global_function(function_config=func_config)
+    def fetch_job():
+        with flow.scope.placement(device_type, "0:0"):
+            good_step_counter = flow.get_variable(
+                name="good_step_counter",
+                shape=(1,),
+                dtype=flow.int64,
+                initializer=flow.constant_initializer(
+                    op_param["good_step_counter_value"], dtype=flow.int64
+                ),
+            )
+            loss_scale = flow.get_variable(
+                name="loss_scale",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.constant_initializer(
+                    op_param["loss_scale_value"], dtype=flow.float
+                ),
+            )
+        return (good_step_counter, loss_scale)
+
+    count_not_finite = np.array([op_param["count_not_finite"]]).astype(np.int64)
+    schedule_job(count_not_finite).get()
+    (good_step_counter, loss_scale) = fetch_job().get()
+    assert good_step_counter.numpy()[0] == op_param["result_step"]
+    assert loss_scale.numpy()[0] == op_param["result_loss_scale"]
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDynamicLossScaleSchedule(flow.unittest.TestCase):
+    def test_dynamic_loss_scale_schedule(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["op_param"] = [
+            {
+                "count_not_finite": 1,
+                "good_step_counter_value": 1,
+                "loss_scale_value": 100.0,
+                "increment_period": 1,
+                "multiplier": 2.0,
+                "result_step": 0,
+                "result_loss_scale": 50.0,
+            },
+            {
+                "count_not_finite": 0,
+                "good_step_counter_value": 1,
+                "loss_scale_value": 100.0,
+                "increment_period": 1,
+                "multiplier": 2.0,
+                "result_step": 0,
+                "result_loss_scale": 200.0,
+            },
+            {
+                "count_not_finite": 0,
+                "good_step_counter_value": 1,
+                "loss_scale_value": 100.0,
+                "increment_period": 10,
+                "multiplier": 2.0,
+                "result_step": 2,
+                "result_loss_scale": 100.0,
+            },
+        ]
+        for arg in GenArgList(arg_dict):
+            _run_test(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_dynamic_reshape.py b/python/oneflow/compatible/single_client/test/ops/test_dynamic_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..855ae2b0227eff26ed6e52c8915e1125b0b2ec8b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_dynamic_reshape.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestDynamicReshape(flow.unittest.TestCase):
+    def test_dynamic_reshape(test_case):
+        data_shape = (10, 10, 10)
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(type="train", function_config=func_config)
+        def DynamicReshapeJob(x: oft.ListNumpy.Placeholder(data_shape)):
+            reshape_out1 = flow.reshape(x, (-1, 20))
+            my_model = flow.get_variable(
+                "my_model",
+                shape=(20, 32),
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            my_model = flow.cast_to_current_logical_view(my_model)
+            mm_out = flow.matmul(reshape_out1, my_model)
+            reshape_out2 = flow.reshape(mm_out, (-1, 8, 4))
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(reshape_out2)
+            return reshape_out1
+
+        data = [np.random.rand(*data_shape).astype(np.float32) for i in range(2)]
+        out = DynamicReshapeJob(data).get().numpy_list()
+        for i in range(2):
+            test_case.assertTrue(np.array_equal(np.reshape(data[i], (50, 20)), out[i]))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_elementwise_maximum_minimum.py b/python/oneflow/compatible/single_client/test/ops/test_elementwise_maximum_minimum.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d9f0b3b6669ab29862eca7f0f18d3a7ef0fe0ab
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_elementwise_maximum_minimum.py
@@ -0,0 +1,204 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_Xmum_with_np(
+    input_shape,
+    compare_type,
+    device_type,
+    machine_ids,
+    device_counts,
+    value_type,
+    dx_only,
+):
+    input_1 = np.random.random(size=input_shape).astype(value_type["np_type"])
+    if dx_only:
+        input_2 = (np.zeros(input_shape) + 1.5).astype(value_type["np_type"])
+    else:
+        input_2 = np.random.random(size=input_shape).astype(value_type["np_type"])
+    assert compare_type in ["maximum", "minimum"]
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_Xmum(input1, input2, compare_type):
+        if compare_type == "minimum":
+            return np.minimum(input1, input2)
+        elif compare_type == "maximum":
+            return np.maximum(input1, input2)
+
+    np_out_Xmum = np_Xmum(input_1, input_2, compare_type)
+
+    def np_diff(input1, input2, compare_type):
+        elem_cnt = input1.size
+        init_shape = input1.shape
+        input1 = input1.flatten()
+        input2 = input2.flatten()
+        np_diff = np.zeros_like(input1)
+        for i in range(elem_cnt):
+            if compare_type == "maximum":
+                if input1[i] > input2[i]:
+                    np_diff[i] = 1
+            elif compare_type == "minimum":
+                if input1[i] < input2[i]:
+                    np_diff[i] = 1
+        return np.reshape(np_diff, init_shape)
+
+    _np_grad = np_diff(input_1, input_2, compare_type)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_Xmum(
+        of_input_1: tp.Numpy.Placeholder(
+            shape=input_1.shape, dtype=value_type["of_type"]
+        ),
+        of_input_2: tp.Numpy.Placeholder(
+            shape=input_2.shape, dtype=value_type["of_type"]
+        ),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v1 = flow.get_variable(
+                shape=input_1.shape,
+                dtype=value_type["of_type"],
+                initializer=flow.zeros_initializer(),
+                name="x1_var",
+            )
+            x1_var = of_input_1 + v1
+        if not dx_only:
+            v2 = flow.get_variable(
+                shape=input_2.shape,
+                dtype=value_type["of_type"],
+                initializer=flow.zeros_initializer(),
+                name="x2_var",
+            )
+            x2_var = of_input_2 + v2
+        else:
+            x2_var = flow.constant(
+                value=1.5, shape=of_input_2.shape, dtype=value_type["of_type"]
+            )
+        flow.watch_diff(x1_var, assert_prediction_grad)
+        if compare_type == "maximum":
+            of_Xmum_out = flow.math.maximum(x1_var, x2_var)
+        elif compare_type == "minimum":
+            of_Xmum_out = flow.math.minimum(x1_var, x2_var)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_Xmum_out)
+        return of_Xmum_out
+
+    of_out_Xmum = oneflow_Xmum(input_1, input_2)
+    assert np.allclose(of_out_Xmum, np_out_Xmum)
+
+
+def _gen_arg_dict(
+    shape, compare_type, device_type, machine_ids, device_counts, value_type, dx_only
+):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [*shape]
+    arg_dict["compare_type"] = [*compare_type]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    arg_dict["value_type"] = [*value_type]
+    arg_dict["dx_only"] = [*dx_only]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestXmum1n1d(flow.unittest.TestCase):
+    def test_Xmum_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=[(3, 3)],
+            compare_type=["maximum", "minimum"],
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+            value_type=[{"np_type": np.float32, "of_type": flow.float32}],
+            dx_only=[True, False],
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_Xmum_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_Xmum_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=[(3, 3)],
+            compare_type=["maximum", "minimum"],
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+            value_type=[
+                {"np_type": np.float32, "of_type": flow.float32},
+                {"np_type": np.float64, "of_type": flow.float64},
+            ],
+            dx_only=[True, False],
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_Xmum_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestXmum1n2d(flow.unittest.TestCase):
+    def test_Xmum_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=[(3, 3)],
+            compare_type=["maximum", "minimum"],
+            device_type="cpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+            value_type=[{"np_type": np.float32, "of_type": flow.float32}],
+            dx_only=[True, False],
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_Xmum_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_Xmum_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=[(3, 3)],
+            compare_type=["maximum", "minimum"],
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+            value_type=[{"np_type": np.float32, "of_type": flow.float32}],
+            dx_only=[True, False],
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_Xmum_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_elementwise_maximum_minimum_dynamic.py b/python/oneflow/compatible/single_client/test/ops/test_elementwise_maximum_minimum_dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..69e34ed1a3617cbbc18ff20fada96cce96c29b44
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_elementwise_maximum_minimum_dynamic.py
@@ -0,0 +1,155 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_dynamic_Xmum_with_np(
+    input_shape, data_shape, compare_type, device_type, machine_ids, device_counts
+):
+    input_1 = np.random.random(size=input_shape).astype(np.float32)
+    input_2 = np.random.random(size=input_shape).astype(np.float32)
+    assert compare_type in ["maximum", "minimum"]
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_placement_scope(flow.scope.placement("gpu", "0:0"))
+
+    def np_Xmum(input1, input2, compare_type):
+        if compare_type == "minimum":
+            return np.minimum(input1, input2)
+        elif compare_type == "maximum":
+            return np.maximum(input1, input2)
+
+    np_out_Xmum = np_Xmum(input_1, input_2, compare_type)
+
+    def np_diff(input1, input2, compare_type):
+        elem_cnt = input1.size
+        init_shape = input1.shape
+        input1 = input1.flatten()
+        input2 = input2.flatten()
+        np_diff = np.zeros_like(input1)
+        for i in range(elem_cnt):
+            if compare_type == "maximum":
+                if input1[i] > input2[i]:
+                    np_diff[i] = 1
+            elif compare_type == "minimum":
+                if input1[i] < input2[i]:
+                    np_diff[i] = 1
+        return np.reshape(np_diff, init_shape)
+
+    _np_grad = np_diff(input_1, input_2, compare_type)
+
+    def assert_prediction_grad(blob: tp.ListNumpy):
+        assert np.allclose(blob[0], _np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_Xmum(
+        of_input_1: tp.ListNumpy.Placeholder(shape=data_shape),
+        of_input_2: tp.ListNumpy.Placeholder(shape=data_shape),
+    ) -> tp.ListNumpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v1 = flow.get_variable(
+                shape=(1,),
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x1_var",
+            )
+            v1 = flow.cast_to_current_logical_view(v1)
+            x1_var = of_input_1 + v1
+            v2 = flow.get_variable(
+                shape=(1,),
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x2_var",
+            )
+            v2 = flow.cast_to_current_logical_view(v2)
+            x2_var = of_input_2 + v2
+        flow.watch_diff(x1_var, assert_prediction_grad)
+        if compare_type == "maximum":
+            of_Xmum_out = flow.math.maximum(x1_var, x2_var)
+        elif compare_type == "minimum":
+            of_Xmum_out = flow.math.minimum(x1_var, x2_var)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_Xmum_out)
+        return of_Xmum_out
+
+    of_out_Xmum = oneflow_Xmum([input_1], [input_2])
+    assert np.allclose(of_out_Xmum[0], np_out_Xmum)
+
+
+def _gen_arg_dict(
+    shape, data_shape, compare_type, device_type, machine_ids, device_counts
+):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["data_shape"] = [data_shape]
+    arg_dict["compare_type"] = [*compare_type]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestXmum1n1d(flow.unittest.TestCase):
+    def test_Xmum_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 4),
+            data_shape=(10, 10),
+            compare_type=["maximum", "minimum"],
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_dynamic_Xmum_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_Xmum_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3),
+            data_shape=(10, 10),
+            compare_type=["maximum", "minimum"],
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_dynamic_Xmum_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_elu.py b/python/oneflow/compatible/single_client/test/ops/test_elu.py
new file mode 100644
index 0000000000000000000000000000000000000000..faf0e949dc10fdbc1d67c9fcbeade2c40e98f218
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_elu.py
@@ -0,0 +1,208 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_elu_with_np(
+    input_shape, alpha, device_type, value_type, machine_ids, device_counts
+):
+    if value_type[1] == flow.float16:
+        input_1 = np.random.uniform(-1, 1, size=input_shape).astype(np.float16)
+        input_1 = np.array(input_1, dtype=value_type[0])
+    else:
+        input_1 = np.random.uniform(-1, 1, size=input_shape).astype(value_type[0])
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    if value_type[1] == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type[1])
+
+    def np_elu(input, alpha):
+        elem_cnt = input.size
+        init_shape = input.shape
+        input = input.flatten()
+        out = np.zeros_like(input)
+        for i in range(elem_cnt):
+            if input[i] > 0:
+                out[i] = input[i]
+            else:
+                out[i] = alpha * (np.exp(input[i]) - 1)
+        out = np.reshape(out, init_shape)
+        return np.array(out).astype(value_type[0])
+
+    np_out_elu = np_elu(input_1, alpha)
+
+    def np_diff(input, alpha):
+        input_shape = input.shape
+        input = input.flatten()
+        elem_cnt = input.size
+        diff = np.zeros(shape=(elem_cnt,))
+        for i in range(elem_cnt):
+            if input[i] > 0:
+                diff[i] = 1
+            else:
+                diff[i] = alpha * np.exp(input[i])
+        diff = np.reshape(diff, newshape=input_shape)
+        diff = np.array(diff, dtype=value_type[0])
+        return diff
+
+    _np_grad = np_diff(input_1, alpha)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        if value_type[1] == flow.float16:
+            assert np.allclose(blob, _np_grad, atol=0.001)
+        else:
+            assert np.allclose(blob, _np_grad, atol=1e-05)
+
+    if value_type[1] == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_elu(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=flow.float32)
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+                x_f16 = flow.cast(x_var, flow.float16)
+            of_elu_out_f16 = flow.nn.elu(x_f16, alpha)
+            of_elu_out_f32 = flow.cast(of_elu_out_f16, flow.float32)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_elu_out_f32)
+            flow.watch_diff(x_var, assert_prediction_grad)
+            return of_elu_out_f32
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_elu(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=value_type[1])
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=value_type[1],
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+            flow.watch_diff(x_var, assert_prediction_grad)
+            of_elu_out = flow.nn.elu(x_var, alpha)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_elu_out)
+            return of_elu_out
+
+    of_out_elu = oneflow_elu(input_1)
+    if value_type[1] == flow.float16:
+        assert np.allclose(of_out_elu, np_out_elu, atol=0.001)
+    else:
+        assert np.allclose(of_out_elu, np_out_elu, atol=1e-05)
+
+
+def _gen_arg_dict(shape, alpha, device_type, value_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["alpha"] = [alpha]
+    arg_dict["device_type"] = [device_type]
+    if value_type == "float" and device_type == "cpu":
+        arg_dict["value_type"] = [
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    else:
+        arg_dict["value_type"] = [
+            (np.float32, flow.float16),
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testelu1n1d(flow.unittest.TestCase):
+    def test_elu_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3),
+            alpha=1.0,
+            device_type="cpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_elu_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_elu_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 4),
+            alpha=2.0,
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_elu_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testelu1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_elu_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 4),
+            alpha=1.0,
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_elu_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_expand_dims.py b/python/oneflow/compatible/single_client/test/ops/test_expand_dims.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc5f3a04c70830e52a15f2e3dadcd74b59bcc968
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_expand_dims.py
@@ -0,0 +1,79 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape, axis):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    def check_grad(x_diff_blob):
+        assert np.array_equal(x_diff_blob.numpy(), np.ones(x_shape))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ExpandDimsJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "var",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.ones_initializer(),
+                trainable=True,
+            )
+            flow.watch_diff(x, check_grad)
+            loss = flow.expand_dims(x, axis)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            return loss
+
+    of_out = ExpandDimsJob().get().numpy()
+    tf_out = tf.expand_dims(np.ones(x_shape, dtype=np.float32), axis).numpy()
+    assert np.array_equal(of_out, tf_out)
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["in_shape"] = [(10, 10)]
+    arg_dict["axis"] = [0, 1, 2, -1, -2, -3]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestExpandDims(flow.unittest.TestCase):
+    def test_expand_dims(test_case):
+        for arg in gen_arg_list():
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_expand_op.py b/python/oneflow/compatible/single_client/test/ops/test_expand_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..406dc4e051baddc7a326782dd0ebb430f91b35c4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_expand_op.py
@@ -0,0 +1,189 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def getExpandGrad(input_shape, expand_size):
+    input = np.random.random(size=input_shape).astype(np.float32)
+    input_stride = [1]
+    for i in range(len(input_shape) - 2, -1, -1):
+        input_stride.insert(0, input_stride[0] * input_shape[i + 1])
+    new_size = []
+    new_stride = []
+    diff = len(expand_size) - len(input_shape)
+    for i in range(len(expand_size) - 1, -1, -1):
+        if i >= diff:
+            if expand_size[i] == -1 or expand_size[i] == input_shape[i - diff]:
+                new_size.insert(0, input_shape[i - diff])
+                new_stride.insert(0, input_stride[i - diff])
+            else:
+                assert expand_size[i] >= 1 and input_shape[i - diff] == 1
+                new_size.insert(0, expand_size[i])
+                new_stride.insert(0, 0)
+        else:
+            assert expand_size[i] >= 1
+            new_size.insert(0, expand_size[i])
+            if expand_size[i] == 1:
+                new_stride.insert(0, new_stride[0])
+            else:
+                new_stride.insert(0, 0)
+    gout = np.random.random(size=tuple(new_size)).astype(np.float32)
+    out_stride = [1]
+    for i in range(len(new_size) - 2, -1, -1):
+        out_stride.insert(0, out_stride[0] * new_size[i + 1])
+    gin = np.zeros(input_shape).flatten()
+    out = np.zeros(np.product(new_size))
+
+    def getOffset(i_offset, stride, expand_stride, n):
+        remain = i_offset
+        o_offset = 0
+        for i in range(n):
+            idx = int(remain / stride[i])
+            o_offset += idx * expand_stride[i]
+            remain = remain - idx * stride[i]
+        return o_offset
+
+    in_flatten = input.flatten()
+    gout_flatten = gout.flatten()
+    num_elem = np.product(new_size)
+    dims = len(new_size)
+    for i in range(num_elem):
+        offset = getOffset(i, out_stride, new_stride, dims)
+        gin[offset] += gout_flatten[i]
+        out[i] = in_flatten[offset]
+    return (input, gout, out.reshape(tuple(new_size)), gin.reshape(input_shape))
+
+
+def _compare_expand_op_with_np(
+    input_shape, expand_dim, data_type, device_type, machine_ids, device_counts
+):
+    assert device_type in ["cpu", "gpu"]
+    if device_type == "cpu" and data_type == flow.float16:
+        return
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    if data_type == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(data_type)
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    (input, gout, out_np, gin_np) = getExpandGrad(input_shape, expand_dim)
+
+    def assert_prediction_grad(gin_of: tp.Numpy):
+        assert np.allclose(gin_of, gin_np, atol=1e-05)
+
+    if data_type == flow.float32:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def expandJob(
+            of_input: tp.Numpy.Placeholder(shape=input.shape, dtype=data_type),
+            multipler: tp.Numpy.Placeholder(shape=gout.shape, dtype=data_type),
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=of_input.shape,
+                    dtype=data_type,
+                    initializer=flow.constant_initializer(0),
+                    name="v",
+                )
+                x_var = of_input + v
+                flow.watch_diff(x_var, assert_prediction_grad)
+            out = flow.expand(x_var, expand_dim)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(out * multipler)
+            return out
+
+        of_out = expandJob(input, gout)
+        assert np.allclose(of_out, out_np, atol=1e-05)
+    elif data_type == flow.float64:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def expandJob(
+            of_input: tp.Numpy.Placeholder(shape=input.shape, dtype=flow.float32),
+            multipler: tp.Numpy.Placeholder(
+                shape=gout.shape, dtype=flow.float32, batch_axis=diff
+            ),
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=of_input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.constant_initializer(0),
+                    name="v",
+                )
+                input_x = v + of_input
+                flow.watch_diff(input_x, assert_prediction_grad)
+            x_fp32 = flow.cast(input_x, flow.float32)
+            x_fp16 = flow.cast(input_x, dtype=flow.float16)
+            y_fp16 = flow.expand(x_fp16, expand_dim)
+            y_fp32 = flow.cast(y_fp16, dtype=flow.float32)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(y_fp32 * multipler)
+            return y_fp32
+
+        of_out = expandJob(input, gout)
+        assert np.allclose(of_out, out_np, atol=1e-05)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestExpandOp1n1d(flow.unittest.TestCase):
+    def test_expand(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(1, 4, 1, 32)]
+        arg_dict["expand_dim"] = [[1, 4, 2, 32]]
+        arg_dict["expand_dim"] = [[2, 4, 2, 32], [2, 1, 2, 4, 2, 32]]
+        arg_dict["data_type"] = [flow.float32, flow.float16]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["machine_ids"] = ["0:0"]
+        arg_dict["device_counts"] = [1]
+        for arg in GenArgList(arg_dict):
+            _compare_expand_op_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestExpandOp1n2d(flow.unittest.TestCase):
+    def test_expand(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(2, 4, 1, 32)]
+        arg_dict["expand_dim"] = [[2, 4, 2, 32], [2, 1, 2, 4, 2, 32]]
+        arg_dict["data_type"] = [flow.float32, flow.float16]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["machine_ids"] = ["0:0-1"]
+        arg_dict["device_counts"] = [2]
+        for arg in GenArgList(arg_dict):
+            _compare_expand_op_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_flatten.py b/python/oneflow/compatible/single_client/test/ops/test_flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..c80c21e206342f4bbf7b7a56c15dab5383e23e16
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_flatten.py
@@ -0,0 +1,89 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def compare_with_numpy(test_case, device_type, input_shape, start_end_dim):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    start_dim = start_end_dim[0]
+    end_dim = start_end_dim[1]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlattenJob() -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "in",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
+                trainable=True,
+            )
+            loss = flow.flatten(x, start_dim=start_dim, end_dim=end_dim)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            return loss
+
+    of_out = FlattenJob()
+    of_x = test_global_storage.Get("x")
+    of_x_shape = of_x.shape
+    of_x_diff = test_global_storage.Get("x_diff")
+    true_end_dim = end_dim + len(of_x_shape) if end_dim < 0 else end_dim
+    new_shape = []
+    for i in range(0, start_dim):
+        new_shape.append(of_x_shape[i])
+    flatten_dim = 1
+    for i in range(start_dim, true_end_dim + 1):
+        flatten_dim *= of_x_shape[i]
+    new_shape.append(flatten_dim)
+    for i in range(true_end_dim + 1, len(of_x_shape)):
+        new_shape.append(of_x_shape[i])
+    np_out = np.reshape(of_x, tuple(new_shape))
+    test_case.assertTrue(of_out.shape == np_out.shape)
+    test_case.assertTrue(np.allclose(of_out, np_out, rtol=1e-05, atol=1e-05))
+    test_case.assertTrue(
+        np.allclose(of_x_diff, np.ones(of_x_diff.shape), rtol=1e-05, atol=1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFlatten(flow.unittest.TestCase):
+    def test_flatten(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(2, 3, 4, 5)]
+        arg_dict["start_end_dim"] = [(0, -1), (1, 3), (2, -2)]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_function_config.py b/python/oneflow/compatible/single_client/test/ops/test_function_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5496aafb5d03c1425ed85e4a13334625a46a7292
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_function_config.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFunctionConfig(flow.unittest.TestCase):
+    def test_default_placement_scope(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_placement_scope(flow.scope.placement("cpu", "0:0"))
+
+        @flow.global_function(function_config=func_config)
+        def Foo():
+            test_case.assertEqual(
+                "cpu", flow.current_scope().device_parallel_desc_symbol.device_tag
+            )
+            return flow.get_variable(
+                "w", (10,), initializer=flow.constant_initializer(1)
+            )
+
+        Foo().get()
+
+    def test_config_setter_getter(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.enable_inplace()
+        test_case.assertEqual(func_config.function_desc.enable_inplace, True)
+
+    def test_global_function_desc(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_placement_scope(flow.scope.placement("cpu", "0:0"))
+
+        @flow.global_function(function_config=func_config)
+        def Foo():
+            test_case.assertEqual(
+                flow.current_global_function_desc().IsTrainable(), False
+            )
+            return flow.get_variable(
+                "w", (10,), initializer=flow.constant_initializer(1)
+            )
+
+        Foo().get()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_function_input_output.py b/python/oneflow/compatible/single_client/test/ops/test_function_input_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..b063499893cfe9de530e627d2563297d805222c2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_function_input_output.py
@@ -0,0 +1,69 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from typing import Tuple
+
+import numpy as np
+
+import oneflow._oneflow_internal
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestFunctionInputOutput(flow.unittest.TestCase):
+    def test_FixedTensorDef(test_case):
+        @flow.global_function()
+        def Foo(x: oft.Numpy.Placeholder((2, 5))):
+            return x
+
+        data = np.ones((2, 5), dtype=np.float32)
+        of_ret = Foo(data).get()
+        test_case.assertEqual(of_ret.numpy().max(), 1)
+        test_case.assertEqual(of_ret.numpy().min(), 1)
+        test_case.assertTrue(np.allclose(of_ret.numpy(), data))
+
+    def test_FixedTensorDef_2_device(test_case):
+        flow.config.gpu_device_num(2)
+
+        @flow.global_function()
+        def Foo(x: oft.Numpy.Placeholder((2, 5))):
+            return x
+
+        data = np.ones((2, 5), dtype=np.float32)
+        of_ret = Foo(data).get()
+        test_case.assertEqual(of_ret.numpy().max(), 1)
+        test_case.assertEqual(of_ret.numpy().min(), 1)
+        test_case.assertTrue(np.allclose(of_ret.numpy(), data))
+
+    def test_MirroredTensorDef(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.ListNumpy.Placeholder((2, 5))):
+            return x
+
+        data = np.ones((1, 5), dtype=np.float32)
+        ndarray_list = Foo([data]).get().numpy_list()
+        test_case.assertEqual(len(ndarray_list), 1)
+        test_case.assertTrue(np.allclose(ndarray_list[0], data))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_fuse_cast_scale.py b/python/oneflow/compatible/single_client/test/ops/test_fuse_cast_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..f09c20a83762659f74d3ec7b2eea1b4f3fb3d6fc
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_fuse_cast_scale.py
@@ -0,0 +1,138 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def fused_cast_scale(x, scale_by_tensor, scale, name):
+    return (
+        flow.user_op_builder(name)
+        .Op("fused_cast_scale")
+        .Input("x", [x])
+        .Input("scale_by_tensor", [scale_by_tensor])
+        .Output("y")
+        .Attr("scale", float(scale))
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def compare_with_tensorflow(
+    device_type,
+    input_shape,
+    in_dtype,
+    out_dtype,
+    test_fuse_cast_scale_pass,
+    has_scalar_mul,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.enable_fuse_cast_scale(True)
+
+    @flow.global_function(type="predict", function_config=func_config)
+    def FusedCastScaleJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "in",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(),
+                trainable=True,
+            )
+            scale = flow.get_variable(
+                "scale",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(),
+                trainable=False,
+            )
+            loss = flow.cast(x, dtype=type_name_to_flow_type[in_dtype])
+            if test_fuse_cast_scale_pass:
+                loss = flow.cast(loss, dtype=type_name_to_flow_type[out_dtype])
+                if has_scalar_mul:
+                    loss = loss * 0.125
+                loss = loss * flow.cast(scale, dtype=type_name_to_flow_type[out_dtype])
+            else:
+                if has_scalar_mul:
+                    scale_val = 0.125
+                else:
+                    scale_val = 1.0
+                loss = fused_cast_scale(
+                    loss,
+                    flow.cast(scale, dtype=type_name_to_flow_type[out_dtype]),
+                    scale=scale_val,
+                    name="fused_cast_scale",
+                )
+            loss = flow.cast(loss, dtype=flow.float)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch(scale, test_global_storage.Setter("scale"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            return loss
+
+    of_out = FusedCastScaleJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        scale = tf.Variable(test_global_storage.Get("scale"))
+        tf_out = tf.cast(x, dtype=type_name_to_np_type[in_dtype])
+        tf_out = tf.cast(tf_out, dtype=type_name_to_np_type[out_dtype]) * tf.cast(
+            scale, dtype=type_name_to_np_type[out_dtype]
+        )
+        if has_scalar_mul:
+            tf_out = tf_out * 0.125
+        tf_out = tf.cast(tf_out, dtype=tf.float32)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFusedCastScale(flow.unittest.TestCase):
+    def test_cast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(5, 4, 3)]
+        arg_dict["in_dtype"] = ["float16", "float32", "double"]
+        arg_dict["out_dtype"] = ["float16", "float32", "double"]
+        arg_dict["test_fuse_cast_scale_pass"] = [True, False]
+        arg_dict["has_scalar_mul"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            if arg[2] == arg[3]:
+                continue
+            if arg[4] == True and (arg[2] != "float16" or arg[3] != "float32"):
+                continue
+            if arg[0] == "cpu" and (arg[2] == "float16" or arg[3] == "float16"):
+                continue
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_fused_bias_add_dropout.py b/python/oneflow/compatible/single_client/test/ops/test_fused_bias_add_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bdf19d6bcd5e80c57a697c46a74fce05c9606af
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_fused_bias_add_dropout.py
@@ -0,0 +1,184 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import Args, GenArgDict, GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def compare_with_not_fused(
+    test_case,
+    device_type,
+    x_shape,
+    data_type,
+    data_format,
+    rate,
+    seed,
+    fuse_add_to_output,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.enable_fuse_add_to_output(fuse_add_to_output)
+    if data_type == "float16":
+        dtype = flow.float
+    else:
+        dtype = type_name_to_flow_type[data_type]
+    if data_format == "NCHW":
+        bias_shape = (x_shape[1],)
+    elif data_format == "NHWC":
+        bias_shape = (x_shape[len(x_shape) - 1],)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowJob(
+        value: oft.Numpy.Placeholder(x_shape),
+        bias: oft.Numpy.Placeholder(bias_shape),
+        addend: oft.Numpy.Placeholder(x_shape),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            value += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            bias += flow.get_variable(
+                name="v2",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            addend += flow.get_variable(
+                name="v3",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x1 = flow.identity(value)
+            x2 = flow.identity(value)
+            bias1 = flow.identity(bias)
+            bias2 = flow.identity(bias)
+            addend1 = flow.identity(addend)
+            addend2 = flow.identity(addend)
+            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
+            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
+            flow.watch_diff(bias1, test_global_storage.Setter("bias1_diff"))
+            flow.watch_diff(bias2, test_global_storage.Setter("bias2_diff"))
+            flow.watch_diff(addend1, test_global_storage.Setter("addend1_diff"))
+            flow.watch_diff(addend2, test_global_storage.Setter("addend2_diff"))
+            if data_type == "float16":
+                out1 = flow.nn.dropout(
+                    flow.nn.bias_add(
+                        flow.cast(x1, dtype=flow.float16),
+                        flow.cast(bias1, dtype=flow.float16),
+                        data_format=data_format,
+                    ),
+                    rate=rate,
+                    seed=seed,
+                    name="dropout",
+                )
+                y1 = flow.cast(
+                    out1 + flow.cast(addend1, dtype=flow.float16), dtype=flow.float
+                )
+                out2 = flow.nn.fused_bias_add_dropout(
+                    flow.cast(x2, dtype=flow.float16),
+                    flow.cast(bias2, dtype=flow.float16),
+                    data_format=data_format,
+                    rate=rate,
+                    seed=seed,
+                )
+                y2 = flow.cast(
+                    out2 + flow.cast(addend2, dtype=flow.float16), dtype=flow.float
+                )
+            else:
+                y1 = (
+                    flow.nn.dropout(
+                        flow.nn.bias_add(x1, bias1, data_format=data_format),
+                        rate=rate,
+                        seed=seed,
+                        name="dropout",
+                    )
+                    + addend1
+                )
+                y2 = (
+                    flow.nn.fused_bias_add_dropout(
+                        x2, bias2, data_format=data_format, rate=rate, seed=seed
+                    )
+                    + addend2
+                )
+            flow.watch(y1, test_global_storage.Setter("y1"))
+            flow.watch(y2, test_global_storage.Setter("y2"))
+            flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
+            flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))
+            loss = y1 + y2
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(flow.math.reduce_sum(loss))
+        return loss
+
+    x = np.random.uniform(low=0, high=10, size=x_shape).astype(np.float32)
+    bias = np.random.uniform(low=0, high=10, size=bias_shape).astype(np.float32)
+    add = np.random.uniform(low=0, high=10, size=x_shape).astype(np.float32)
+    of_out = FlowJob(x, bias, add).get()
+    y1 = test_global_storage.Get("y1")
+    y2 = test_global_storage.Get("y2")
+    tol = 1e-05
+    test_case.assertTrue(np.allclose(y1, y2, rtol=tol, atol=tol, equal_nan=True))
+    x1_diff = test_global_storage.Get("x1_diff")
+    x2_diff = test_global_storage.Get("x2_diff")
+    test_case.assertTrue(
+        np.allclose(x1_diff, x2_diff, rtol=tol, atol=tol, equal_nan=True)
+    )
+    bias1_diff = test_global_storage.Get("bias1_diff")
+    bias2_diff = test_global_storage.Get("bias2_diff")
+    test_case.assertTrue(
+        np.allclose(bias1_diff, bias2_diff, rtol=tol, atol=tol, equal_nan=True)
+    )
+    bias1_diff = test_global_storage.Get("bias1_diff")
+    bias2_diff = test_global_storage.Get("bias2_diff")
+    test_case.assertTrue(
+        np.allclose(bias1_diff, bias2_diff, rtol=tol, atol=tol, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFusedBiasAdd(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_fused_bias_add(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 10), (10, 5), (1, 10, 10, 10), (2, 10, 10, 10)]
+        arg_dict["data_type"] = ["float16", "float32", "double"]
+        arg_dict["data_format"] = ["NCHW"]
+        arg_dict["rate"] = [0.1]
+        arg_dict["seed"] = [1234]
+        arg_dict["fuse_add_to_output"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[2] == "float16":
+                continue
+            compare_with_not_fused(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_fused_bias_add_gelu.py b/python/oneflow/compatible/single_client/test/ops/test_fused_bias_add_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..204ba5fcc44b5c886192d9b965ae338686317e67
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_fused_bias_add_gelu.py
@@ -0,0 +1,137 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import Args, GenArgDict, GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def compare_with_not_fused(test_case, device_type, x_shape, data_type, data_format):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    if data_type == "float16":
+        dtype = flow.float
+    else:
+        dtype = type_name_to_flow_type[data_type]
+    if data_format == "NCHW":
+        bias_shape = (x_shape[1],)
+    elif data_format == "NHWC":
+        bias_shape = (x_shape[len(x_shape) - 1],)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowJob(
+        value: oft.Numpy.Placeholder(x_shape), bias: oft.Numpy.Placeholder(bias_shape)
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            value += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            bias += flow.get_variable(
+                name="v2",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x1 = flow.identity(value)
+            x2 = flow.identity(value)
+            bias1 = flow.identity(bias)
+            bias2 = flow.identity(bias)
+            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
+            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
+            flow.watch_diff(bias1, test_global_storage.Setter("bias1_diff"))
+            flow.watch_diff(bias2, test_global_storage.Setter("bias2_diff"))
+            if data_type == "float16":
+                y1 = flow.cast(
+                    flow.math.gelu(
+                        flow.nn.bias_add(
+                            flow.cast(x1, dtype=flow.float16),
+                            flow.cast(bias1, dtype=flow.float16),
+                            data_format=data_format,
+                        )
+                    ),
+                    dtype=flow.float,
+                )
+                y2 = flow.cast(
+                    flow.nn.fused_bias_add_gelu(
+                        flow.cast(x2, dtype=flow.float16),
+                        flow.cast(bias2, dtype=flow.float16),
+                        data_format=data_format,
+                    ),
+                    dtype=flow.float,
+                )
+            else:
+                y1 = flow.math.gelu(
+                    flow.nn.bias_add(x1, bias1, data_format=data_format)
+                )
+                y2 = flow.nn.fused_bias_add_gelu(x2, bias2, data_format=data_format)
+            flow.watch(y1, test_global_storage.Setter("y1"))
+            flow.watch(y2, test_global_storage.Setter("y2"))
+            flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
+            flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))
+            loss = y1 + y2
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(flow.math.reduce_sum(loss))
+        return loss
+
+    x = np.random.uniform(low=0, high=10, size=x_shape).astype(np.float32)
+    bias = np.random.uniform(low=0, high=10, size=bias_shape).astype(np.float32)
+    of_out = FlowJob(x, bias).get()
+    y1 = test_global_storage.Get("y1")
+    y2 = test_global_storage.Get("y2")
+    tol = 1e-05
+    test_case.assertTrue(np.allclose(y1, y2, rtol=tol, atol=tol, equal_nan=True))
+    x1_diff = test_global_storage.Get("x1_diff")
+    x2_diff = test_global_storage.Get("x2_diff")
+    test_case.assertTrue(
+        np.allclose(x1_diff, x2_diff, rtol=tol, atol=tol, equal_nan=True)
+    )
+    bias1_diff = test_global_storage.Get("bias1_diff")
+    bias2_diff = test_global_storage.Get("bias2_diff")
+    test_case.assertTrue(
+        np.allclose(bias1_diff, bias2_diff, rtol=tol, atol=tol, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFusedBiasAdd(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_fused_bias_add(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 10), (10, 5), (1, 10, 10, 10), (2, 10, 10, 10)]
+        arg_dict["data_type"] = ["float16", "float32", "double"]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[2] == "float16":
+                continue
+            compare_with_not_fused(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_fused_scale_tril.py b/python/oneflow/compatible/single_client/test/ops/test_fused_scale_tril.py
new file mode 100644
index 0000000000000000000000000000000000000000..135ff0b6ca3875d0088767b7778cbfc055546b91
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_fused_scale_tril.py
@@ -0,0 +1,123 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    GenArgDict,
+    test_global_storage,
+    type_name_to_flow_type,
+    type_name_to_np_type,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_fused_scale_tril_fw_bw(
+    test_case, device, shape, type_name, diagonal, fill_value, scale
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if type_name == "float16":
+        flow_type = flow.float
+        np_type = np.float32
+    else:
+        flow_type = type_name_to_flow_type[type_name]
+        np_type = type_name_to_np_type[type_name]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_fused_scale_tril_fw_bw_job(
+        x: oft.Numpy.Placeholder(shape, dtype=flow_type)
+    ):
+        with flow.scope.placement(device, "0:0"):
+            x_var = flow.get_variable(
+                name="xv",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x += flow.cast(x_var, dtype=flow_type)
+            if type_name == "float16":
+                out = flow.cast(
+                    flow.math.fused_scale_tril(
+                        flow.cast(x, flow.float16), diagonal, scale=scale
+                    ),
+                    flow.float,
+                )
+            else:
+                out = flow.math.fused_scale_tril(x, diagonal, scale=scale)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(out)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(out, test_global_storage.Setter("out"))
+            flow.watch_diff(out, test_global_storage.Setter("out_diff"))
+            return out
+
+    x = np.random.randint(low=0, high=100, size=shape)
+    test_fused_scale_tril_fw_bw_job(x.astype(np_type)).get()
+    np_out = np.where(
+        np.tril(np.ones(shape), diagonal),
+        test_global_storage.Get("x") * scale,
+        np.full(shape, fill_value).astype(np_type),
+    )
+    np_x_diff = np.tril(test_global_storage.Get("out_diff"), diagonal) * scale
+    if type_name == "float16":
+        tolerance = 0.001
+    else:
+        tolerance = 1e-05
+    test_case.assertTrue(
+        np.allclose(
+            np_out, test_global_storage.Get("out"), rtol=tolerance, atol=tolerance
+        )
+    )
+    test_case.assertTrue(
+        np.allclose(
+            np_x_diff, test_global_storage.Get("x_diff"), rtol=tolerance, atol=tolerance
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFusedScaleTril(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_fused_scale_tril_fw_bw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["gpu"]
+        arg_dict["type_name"] = ["float32", "float16", "double", "int32", "int64"]
+        arg_dict["shape"] = [(3, 6, 8)]
+        arg_dict["diagonal"] = [-8, -1, 0, 8]
+        arg_dict["fill_value"] = [1.0, 0]
+        arg_dict["scale"] = [5.0, 3]
+        for arg in GenArgDict(arg_dict):
+            if isinstance(arg["fill_value"], float) and arg_dict["type_name"] not in [
+                "float32",
+                "float16",
+                "double",
+            ]:
+                continue
+            _test_fused_scale_tril_fw_bw(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_fused_scale_tril_softmax_mask_and_scale.py b/python/oneflow/compatible/single_client/test/ops/test_fused_scale_tril_softmax_mask_and_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..476ba9673eac27a3344db5afcc3a316c31cb39ee
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_fused_scale_tril_softmax_mask_and_scale.py
@@ -0,0 +1,145 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def compare_with_not_fused(
+    test_case, device_type, x_shape, data_type, diagonal, fill_value, scale, rate, seed
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    if data_type == "float16":
+        dtype = flow.float
+    else:
+        dtype = type_name_to_flow_type[data_type]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_fused_scale_tril_softmax_dropout_fw_bw_job():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=dtype,
+                initializer=flow.random_uniform_initializer(minval=-1.0, maxval=1.0),
+                trainable=True,
+            )
+            flow.watch(x, test_global_storage.Setter("x"))
+            x1 = flow.identity(x)
+            x2 = flow.identity(x)
+            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
+            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
+            if data_type == "float16":
+                y1 = flow.cast(
+                    flow.nn.dropout(
+                        flow.nn.softmax(
+                            flow.math.fused_scale_tril(
+                                flow.cast(x1, dtype=flow.float16),
+                                diagonal=diagonal,
+                                fill_value=fill_value,
+                                scale=scale,
+                            )
+                        ),
+                        rate=rate,
+                        seed=seed,
+                        name="dropout",
+                    ),
+                    dtype=flow.float,
+                )
+                y2 = flow.cast(
+                    flow.nn.fused_scale_tril_softmax_dropout(
+                        flow.cast(x2, dtype=flow.float16),
+                        diagonal=diagonal,
+                        fill_value=fill_value,
+                        scale=scale,
+                        rate=rate,
+                        seed=seed,
+                    ),
+                    dtype=flow.float,
+                )
+            else:
+                y1 = flow.nn.dropout(
+                    flow.nn.softmax(
+                        flow.math.fused_scale_tril(
+                            x1, diagonal=diagonal, fill_value=fill_value, scale=scale
+                        )
+                    ),
+                    rate=rate,
+                    seed=seed,
+                    name="dropout",
+                )
+                y2 = flow.nn.fused_scale_tril_softmax_dropout(
+                    x2,
+                    diagonal=diagonal,
+                    fill_value=fill_value,
+                    scale=scale,
+                    rate=rate,
+                    seed=seed,
+                )
+            flow.watch(y1, test_global_storage.Setter("y1"))
+            flow.watch(y2, test_global_storage.Setter("y2"))
+            flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
+            flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))
+            loss = y1 + y2
+            total_loss = loss * x
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(flow.math.reduce_sum(total_loss))
+        return loss
+
+    of_out = test_fused_scale_tril_softmax_dropout_fw_bw_job().get()
+    y1 = test_global_storage.Get("y1")
+    y2 = test_global_storage.Get("y2")
+    tol = 0.001 if data_type == "float16" else 1e-05
+    test_case.assertTrue(np.allclose(y1, y2, rtol=tol, atol=tol, equal_nan=True))
+    x1_diff = test_global_storage.Get("x1_diff")
+    x2_diff = test_global_storage.Get("x2_diff")
+    test_case.assertTrue(
+        np.allclose(x1_diff, x2_diff, rtol=tol, atol=tol, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFusedScaleTrilSoftmaxDropout(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_fused_scale_tril_softmax_dropout(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(2, 2, 5, 5), (10, 20), (32, 12, 128), (10, 960)]
+        arg_dict["data_type"] = ["float16", "float32", "double"]
+        arg_dict["diagonal"] = [-1, 0]
+        arg_dict["fill_value"] = [float("-inf"), 0]
+        arg_dict["scale"] = [0.125]
+        arg_dict["rate"] = [0.5]
+        arg_dict["seed"] = [12345]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[2] == "float16":
+                continue
+            compare_with_not_fused(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_fused_self_attention_query_mul_key_and_value.py b/python/oneflow/compatible/single_client/test/ops/test_fused_self_attention_query_mul_key_and_value.py
new file mode 100644
index 0000000000000000000000000000000000000000..37fc0c69a139d05b7031ce7a35bf4c9f2538cf0f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_fused_self_attention_query_mul_key_and_value.py
@@ -0,0 +1,182 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import typing
+import unittest
+
+import numpy as np
+import test_global_storage
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def get_func_conf():
+    func_conf = flow.FunctionConfig()
+    func_conf.default_placement_scope(flow.scope.placement("gpu", "0:0"))
+    return func_conf
+
+
+def get_lr_scheduler():
+    return flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+
+
+def get_alpha(head_size):
+    return 1.0
+
+
+def make_self_attn_qk_v_func(batch_size, seq_len, num_heads, head_size, fused, fp16):
+    flow.clear_default_session()
+    hidden_size = num_heads * 3 * head_size
+
+    @flow.global_function(type="predict", function_config=get_func_conf())
+    def self_attn_qk_v_fw_bw(
+        h: flow.typing.Numpy.Placeholder(
+            shape=(seq_len, batch_size, hidden_size), dtype=flow.float32
+        )
+    ) -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy]:
+        var = flow.get_variable(
+            "var",
+            shape=(1,),
+            dtype=flow.float32,
+            initializer=flow.constant_initializer(1.0, dtype=flow.float32),
+            trainable=True,
+        )
+        h = h * var
+        if fused:
+            flow.watch_diff(h, test_global_storage.Setter("h_grad_fused"))
+        else:
+            flow.watch_diff(h, test_global_storage.Setter("h_grad"))
+        if fp16:
+            h = flow.amp_white_identity(h)
+        alpha = get_alpha(head_size)
+        if fused:
+            (qmk, v) = flow.nn.fused_self_attention_query_mul_key_and_value(
+                h, head_size=head_size, alpha=alpha
+            )
+        else:
+            h = flow.reshape(h, (seq_len, batch_size, -1, 3 * head_size))
+            (q, k, v) = (
+                flow.transpose(
+                    flow.slice(
+                        h,
+                        begin=[None, None, None, head_size * i],
+                        size=[None, None, None, head_size],
+                    ),
+                    perm=[1, 2, 0, 3],
+                )
+                for i in range(3)
+            )
+            qmk = flow.matmul(q, k, transpose_b=True, alpha=alpha)
+        h = flow.matmul(qmk, v)
+        loss = flow.math.reduce_sum(h)
+        flow.optimizer.SGD(get_lr_scheduler(), momentum=0).minimize(loss)
+        return (qmk, v)
+
+    return self_attn_qk_v_fw_bw
+
+
+def gen_random_input(shape):
+    return np.random.rand(*shape).astype(np.float32)
+
+
+def compare_fused_with_no_fused(
+    test_case, batch_size, seq_len, num_heads, head_size, fp16, verbose=False
+):
+    hidden_size = num_heads * 3 * head_size
+    input = gen_random_input((seq_len, batch_size, hidden_size))
+    func = make_self_attn_qk_v_func(
+        batch_size, seq_len, num_heads, head_size, True, fp16
+    )
+    (qmk, v) = func(input)
+    func_ = make_self_attn_qk_v_func(
+        batch_size, seq_len, num_heads, head_size, False, fp16
+    )
+    (qmk_, v_) = func_(input)
+    (_q, _k, _v) = np_qkv(input, head_size)
+    _qmk = np_bgemm(
+        _q.transpose(1, 2, 0, 3), _k.transpose(1, 2, 3, 0), get_alpha(head_size)
+    )
+    _v = _v.transpose(1, 2, 0, 3)
+    if verbose:
+        print("")
+        print("=" * 80)
+        print(f"input: {input.shape}\n{input}")
+        print(f"_q: {_q.shape}\n{_q}")
+        print(f"_k: {_k.shape}\n{_k}")
+        print(f"_v: {_v.shape}\n{_v}")
+        print(f"_qmk: {_qmk.shape}\n{_qmk}")
+        print(f"qmk: {qmk.shape}\n{qmk}")
+        print(f"qmk_: {qmk_.shape}\n{qmk_}")
+        diff = qmk - qmk_
+        print("abs diff mean:", np.abs(diff).mean())
+        print("abs diff max:", np.abs(diff).max())
+    test_case.assertTrue(np.allclose(qmk, qmk_))
+    test_case.assertTrue(np.allclose(qmk, _qmk))
+    test_case.assertTrue(np.allclose(v, v_))
+    test_case.assertTrue(np.allclose(v, _v))
+    h_grad = test_global_storage.Get("h_grad_fused")
+    h_grad_ = test_global_storage.Get("h_grad")
+    if verbose:
+        print(f"h_grad: {h_grad.shape}\n{h_grad}")
+        print(f"h_grad_: {h_grad_.shape}\n{h_grad_}")
+    test_case.assertTrue(np.allclose(h_grad, h_grad_))
+
+
+def np_qkv(h, head_size):
+    h = np.reshape(h, (h.shape[0], h.shape[1], -1, 3 * head_size))
+    q = h[:, :, :, :head_size]
+    k = h[:, :, :, head_size : head_size * 2]
+    v = h[:, :, :, head_size * 2 :]
+    return (q, k, v)
+
+
+def np_bgemm(a, b, alpha):
+    assert a.ndim == b.ndim
+    assert a.ndim >= 2
+    assert a.shape[-1] == b.shape[-2]
+    if a.ndim > 2:
+        a_ = np.reshape(a, (-1, a.shape[-2], a.shape[-1]))
+        b_ = np.reshape(b, (-1, b.shape[-2], b.shape[-1]))
+        assert a_.shape[0] == b_.shape[0]
+        c = np.zeros(shape=(a_.shape[0], a_.shape[-2], b_.shape[-1]), dtype=np.float32)
+        for i in range(a_.shape[0]):
+            c[i] = np.matmul(a_[i], b_[i]) * alpha
+    else:
+        c = np.matmul(a, b) * alpha
+    shape = a.shape[:-2] + c.shape[-2:]
+    return np.reshape(c, shape)
+
+
+@flow.unittest.skip_unless_1n1d()
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+class TestFusedSelfAttentionQueryMulKeyAndValue(flow.unittest.TestCase):
+    def test_fp32(self):
+        if flow.eager_execution_enabled():
+            print("\nSkip under erger mode!")
+            return
+        compare_fused_with_no_fused(self, 4, 1024, 12, 64, False)
+
+    def test_fp16(self):
+        if flow.eager_execution_enabled():
+            print("\nSkip under erger mode!")
+            return
+        compare_fused_with_no_fused(self, 4, 1024, 12, 64, True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_gather.py b/python/oneflow/compatible/single_client/test/ops/test_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b014fb12f784068fdd37a797b634b6f758f4dab
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_gather.py
@@ -0,0 +1,179 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _random_inputs(params_shape, indices_shape, axis):
+    params = np.random.rand(*params_shape).astype(np.float32)
+    indices = np.random.randint(
+        low=0, high=params_shape[axis], size=indices_shape, dtype=np.int32
+    )
+    return (params, indices)
+
+
+def _make_gather_fn(
+    params, indices, axis, batch_dims, device_type, mirrored, compare_fn
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if mirrored:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+    def do_gather(x_blob, i_blob):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "params",
+                shape=params.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            x = flow.cast_to_current_logical_view(x)
+            x = x + x_blob
+            y = flow.gather(x, i_blob, axis=axis, batch_dims=batch_dims)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(y)
+        flow.watch_diff(x, compare_fn)
+        return y
+
+    if mirrored:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def gather_fn(
+            params_def: oft.ListNumpy.Placeholder(params.shape, dtype=flow.float),
+            indices_def: oft.ListNumpy.Placeholder(indices.shape, dtype=flow.int32),
+        ):
+            return do_gather(params_def, indices_def)
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def gather_fn(
+            params_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
+            indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
+        ):
+            return do_gather(params_def, indices_def)
+
+    return gather_fn
+
+
+def _compare_gather_with_tf(
+    test_case,
+    device_type,
+    params_shape,
+    indices_shape,
+    axis,
+    batch_dims,
+    mirrored=False,
+):
+    (params, indices) = _random_inputs(params_shape, indices_shape, axis)
+    i = tf.constant(indices)
+    with tf.GradientTape() as t:
+        x = tf.Variable(params)
+        y = tf.gather(x, i, axis=axis)
+    dy = t.gradient(y, x)
+    if isinstance(dy, tf.IndexedSlices):
+        test_case.assertTrue(
+            np.array_equal(indices.ravel(), dy.indices.numpy().ravel())
+        )
+        zero_params = tf.Variable(np.full(params.shape, 0.0, dtype=np.float32))
+        dy = tf.tensor_scatter_nd_add(zero_params, i, dy.values)
+    if mirrored:
+
+        def compare_dy(params_grad):
+            test_case.assertTrue(
+                np.array_equal(dy.numpy(), params_grad.numpy_list()[0])
+            )
+
+    else:
+
+        def compare_dy(params_grad):
+            test_case.assertTrue(np.array_equal(dy.numpy(), params_grad.numpy()))
+
+    gather_fn = _make_gather_fn(
+        params, indices, axis, batch_dims, device_type, mirrored, compare_dy
+    )
+    if mirrored:
+        of_y = gather_fn([params], [indices]).get().numpy_list()[0]
+    else:
+        of_y = gather_fn(params, indices).get().numpy()
+    test_case.assertTrue(np.array_equal(y.numpy(), of_y))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGather(flow.unittest.TestCase):
+    def test_gather(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["params_shape"] = [(2, 8)]
+        arg_dict["indices_shape"] = [(2, 1)]
+        arg_dict["axis"] = [0]
+        arg_dict["batch_dims"] = [0]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+    def test_gather_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(2, 10, 2)]
+        arg_dict["indices_shape"] = [(2, 1)]
+        arg_dict["axis"] = [0]
+        arg_dict["batch_dims"] = [0]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+    def test_gather_case_2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["params_shape"] = [(200, 80)]
+        arg_dict["indices_shape"] = [(150, 1)]
+        arg_dict["axis"] = [0]
+        arg_dict["batch_dims"] = [0]
+        arg_dict["mirrored"] = [True]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+    def test_gather_case_3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(30, 150, 50, 2)]
+        arg_dict["indices_shape"] = [(20, 15, 45)]
+        arg_dict["axis"] = [1]
+        arg_dict["batch_dims"] = [0]
+        arg_dict["mirrored"] = [True]
+        for arg in GenArgList(arg_dict):
+            _compare_gather_with_tf(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_gather_model_parallel.py b/python/oneflow/compatible/single_client/test/ops/test_gather_model_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0e78958f86466793628db7d76bcd3e07afa5019
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_gather_model_parallel.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _gen_test_data(params_shape, indices_shape, axis):
+    params = np.random.rand(*params_shape).astype(np.float32)
+    indices = np.random.randint(
+        low=0, high=params_shape[axis], size=indices_shape
+    ).astype(np.int32)
+    slices = [slice(None)] * len(params_shape)
+    slices[axis] = indices
+    out = params[tuple(slices)]
+    return (params, indices, out)
+
+
+def _test_gather_model_parallel_fw(
+    test_case, device_type, params_shape, indices_shape, axis, split_axis
+):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def gather_model_parallel_fw_job(
+        params: oft.Numpy.Placeholder(params_shape, dtype=flow.float),
+        indices: oft.Numpy.Placeholder(indices_shape, dtype=flow.int32),
+    ):
+        with flow.scope.placement(device_type, "0:0-3"):
+            params = params.with_distribute(flow.distribute.split(split_axis))
+            indices = indices.with_distribute(flow.distribute.broadcast())
+            return flow.gather(params=params, indices=indices, axis=axis)
+
+    (params_arr, indices_arr, out_arr) = _gen_test_data(
+        params_shape, indices_shape, axis
+    )
+    out = gather_model_parallel_fw_job(params_arr, indices_arr).get().numpy()
+    if axis == split_axis:
+        test_case.assertTrue(np.allclose(out, out_arr))
+    else:
+        test_case.assertTrue(np.array_equal(out, out_arr))
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestGatherModelParallel(flow.unittest.TestCase):
+    def test_gather_model_parallel_fw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["params_shape"] = [(96, 96, 96)]
+        arg_dict["indices_shape"] = [(32, 48)]
+        arg_dict["axis"] = [0, 1, 2]
+        arg_dict["split_axis"] = [0, 1, 2]
+        for arg in GenArgList(arg_dict):
+            _test_gather_model_parallel_fw(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_gather_nd.py b/python/oneflow/compatible/single_client/test/ops/test_gather_nd.py
new file mode 100644
index 0000000000000000000000000000000000000000..f247a81783ab77c66c68ebac0adc2ae3bb941a88
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_gather_nd.py
@@ -0,0 +1,316 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgDict, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def _random_inputs(x_shape, x_dtype, index_shape, index_dtype):
+    assert isinstance(x_shape, (tuple, list))
+    assert isinstance(index_shape, (tuple, list))
+    assert index_dtype == np.int32 or index_dtype == np.int64
+    if x_dtype == np.float32 or x_dtype == np.double:
+        x = np.random.rand(*x_shape).astype(x_dtype)
+    elif x_dtype == np.int32 or x_dtype == np.int64 or x_dtype == np.int8:
+        x = np.random.randint(low=0, high=100, size=x_shape).astype(x_dtype)
+    else:
+        raise NotImplementedError("{}".format(x_dtype))
+    index = []
+    index_rows = np.prod(index_shape[:-1])
+    index_cols = index_shape[-1]
+    for col in range(index_cols):
+        index_col = np.random.randint(
+            low=0, high=x_shape[col], size=(index_rows,), dtype=index_dtype
+        ).reshape(index_shape[:-1])
+        index.append(index_col)
+    index = np.stack(index, axis=len(index_shape) - 1)
+    return (x, index)
+
+
+def _make_gather_nd_fn(
+    x_shape,
+    index_shape,
+    x_dtype,
+    index_type,
+    device_type,
+    device_num,
+    dynamic,
+    need_grad,
+    comp_diff_fn,
+):
+    assert device_num >= 1
+    fn_type = "train" if need_grad else "predict"
+    if device_type == "gpu":
+        flow.config.gpu_device_num(device_num)
+    elif device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        raise ValueError
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(x_dtype)
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    if dynamic:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+    def do_gather_nd(x, index):
+        x_var = flow.get_variable(
+            "params",
+            shape=(1,),
+            dtype=x_dtype,
+            initializer=flow.constant_initializer(0, x_dtype),
+        )
+        x = x + flow.cast_to_current_logical_view(x_var)
+        y = flow.gather_nd(x, index)
+        if need_grad:
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(y)
+            if callable(comp_diff_fn):
+                flow.watch_diff(x, comp_diff_fn)
+        return y
+
+    if dynamic:
+
+        @flow.global_function(type=fn_type, function_config=func_config)
+        def gather_nd_fn(
+            x: flow.typing.ListNumpy.Placeholder(x_shape, dtype=x_dtype),
+            index: flow.typing.ListNumpy.Placeholder(index_shape, dtype=index_type),
+        ) -> flow.typing.ListNumpy:
+            return do_gather_nd(x, index)
+
+    else:
+
+        @flow.global_function(type=fn_type, function_config=func_config)
+        def gather_nd_fn(
+            x: flow.typing.Numpy.Placeholder(x_shape, dtype=x_dtype),
+            index: flow.typing.Numpy.Placeholder(index_shape, dtype=index_type),
+        ) -> flow.typing.Numpy:
+            return do_gather_nd(x, index)
+
+    return gather_nd_fn
+
+
+def _gather_nd_np(x, index, require_grad=False, init_grad_value=1.0):
+    ndim = index.shape[-1]
+    assert ndim <= x.ndim
+    indices = []
+    for dim in range(ndim):
+        indices.append(index[..., dim])
+    y = x[tuple(indices)]
+    dy = None
+    dx = None
+    if require_grad:
+        dy = np.zeros(shape=y.shape, dtype=np.float32)
+        dy.fill(init_grad_value)
+        dx = np.zeros(shape=x.shape, dtype=np.float32)
+        flat_index = index.reshape(-1, ndim)
+        flat_dy = dy.reshape(-1, *y.shape[index.ndim - 1 :])
+        for (i, nd_index) in enumerate(flat_index):
+            if dx.ndim == ndim:
+                ravel_index = np.ravel_multi_index(nd_index, dx.shape)
+                dx_partial = np.zeros(shape=dx.shape, dtype=np.float32)
+                np.put(dx_partial, ravel_index, flat_dy[i])
+                dx += dx_partial
+            else:
+                dx[tuple(nd_index)] += flat_dy[i]
+    return (y, dx)
+
+
+def _is_floating_dtype(dtype):
+    if dtype in ("float32", "double", "float16"):
+        return True
+    return False
+
+
+def _compare_with_np(
+    test_case,
+    shape,
+    index_shape,
+    dynamic_shape=None,
+    dynamic_index_shape=None,
+    dtype="float32",
+    index_dtype="int32",
+    device_type="gpu",
+    device_num=1,
+    dynamic=False,
+):
+    x_is_floating = _is_floating_dtype(dtype)
+    need_grad = True if x_is_floating else False
+    x_of_dtype = type_name_to_flow_type[dtype]
+    index_of_dtype = type_name_to_flow_type[index_dtype]
+    x_dtype = type_name_to_np_type[dtype]
+    index_dtype = type_name_to_np_type[index_dtype]
+    if dynamic_shape is None:
+        dynamic_shape = shape
+    else:
+        dynamic = True
+    if dynamic_index_shape is None:
+        dynamic_index_shape = index_shape
+    else:
+        dynamic = True
+    if dynamic:
+        (x, index, y, dx) = ([], [], [], [])
+        for _ in range(device_num):
+            (x_, index_) = _random_inputs(
+                dynamic_shape, x_dtype, dynamic_index_shape, index_dtype
+            )
+            (y_, dx_) = _gather_nd_np(x_, index_, need_grad)
+            x.append(x_)
+            index.append(index_)
+            y.append(y_)
+            dx.append(dx_)
+
+        def comp_diff(dx_blob: flow.typing.ListNumpy):
+            for (dx_blob_, dx_) in zip(dx_blob, dx):
+                test_case.assertTrue(np.array_equal(dx_blob_, dx_))
+
+    else:
+        (x, index) = _random_inputs(
+            dynamic_shape, x_dtype, dynamic_index_shape, index_dtype
+        )
+        (y, dx) = _gather_nd_np(x, index, need_grad)
+
+        def comp_diff(dx_blob: flow.typing.Numpy):
+            test_case.assertTrue(np.array_equal(dx_blob, dx))
+
+    flow.clear_default_session()
+    gather_nd_fn = _make_gather_nd_fn(
+        shape,
+        index_shape,
+        x_of_dtype,
+        index_of_dtype,
+        device_type,
+        device_num,
+        dynamic,
+        need_grad,
+        comp_diff if device_num == 1 else None,
+    )
+    ret_y = gather_nd_fn(x, index)
+    if dynamic:
+        for (ret_y_, y_) in zip(ret_y, y):
+            test_case.assertTrue(np.array_equal(ret_y_, y_))
+    else:
+        test_case.assertTrue(np.array_equal(ret_y, y))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGatherNd(flow.unittest.TestCase):
+    def test_gather_nd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(10,)]
+        arg_dict["index_shape"] = [(5, 1)]
+        arg_dict["dtype"] = ["float32", "int32", "double"]
+        arg_dict["index_dtype"] = ["int32", "int64"]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dynamic"] = [False, True]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_gather_nd_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(20, 10, 10, 3, 3)]
+        arg_dict["index_shape"] = [(2, 3, 3)]
+        arg_dict["device_type"] = ["gpu"]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    def test_gather_nd_case_2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(10, 8, 4)]
+        arg_dict["index_shape"] = [(2, 2)]
+        arg_dict["dtype"] = ["float32", "int32"]
+        arg_dict["index_dtype"] = ["int32", "int64"]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["dynamic"] = [True]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_gather_nd_case_3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(32, 60, 80, 25)]
+        arg_dict["index_shape"] = [(128, 2)]
+        arg_dict["device_type"] = ["gpu"]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_gather_nd_case_4(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(128, 64, 2, 16, 7)]
+        arg_dict["index_shape"] = [(30, 10, 3)]
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["dynamic"] = [True]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    def test_with_dynamic_x(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(32, 16)]
+        arg_dict["dynamic_shape"] = [(30, 15)]
+        arg_dict["index_shape"] = [(12, 1)]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    def test_with_dynamic_index(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(25, 10)]
+        arg_dict["index_shape"] = [(15, 1)]
+        arg_dict["dynamic_index_shape"] = [(11, 1)]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    def test_with_empty_index(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(12, 13, 7)]
+        arg_dict["index_shape"] = [(5, 10, 2)]
+        arg_dict["dynamic_index_shape"] = [(5, 0, 2)]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestGatherNdParallel(flow.unittest.TestCase):
+    def test_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(12, 5)]
+        arg_dict["index_shape"] = [(4, 8, 2)]
+        arg_dict["dtype"] = ["float32", "int32", "double"]
+        arg_dict["index_dtype"] = ["int32", "int64"]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["device_num"] = [4]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_gelu.py b/python/oneflow/compatible/single_client/test/ops/test_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba2eee798d392e58f4b77f6d0b4c98b3b38314ad
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_gelu.py
@@ -0,0 +1,63 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgDict, RunOneflowOp
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def tf_gelu(x):
+    inv_sqrt2 = math.sqrt(0.5)
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(x)
+        y = 0.5 * x * (1 + tf.math.erf(inv_sqrt2 * x))
+    x_diff = tape.gradient(y, x)
+    return (y.numpy(), x_diff.numpy())
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGelu(flow.unittest.TestCase):
+    def test_gelu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["flow_op"] = [flow.math.gelu]
+        arg_dict["flow_args"] = [[]]
+        arg_dict["x"] = [
+            np.random.uniform(low=-100, high=100, size=(10, 20, 30, 40)).astype(
+                np.float32
+            )
+        ]
+        for arg in GenArgDict(arg_dict):
+            (of_y, of_x_diff) = RunOneflowOp(**arg)
+            (tf_y, tf_x_diff) = tf_gelu(arg["x"])
+            assert np.allclose(of_y, tf_y, rtol=1e-05, atol=1e-05)
+            assert np.allclose(of_x_diff, tf_x_diff, rtol=1e-05, atol=1e-05)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_get_variable.py b/python/oneflow/compatible/single_client/test/ops/test_get_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..65432581b73c0f9b5ff0863144ec330ed09bd0d5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_get_variable.py
@@ -0,0 +1,144 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGetVariable(flow.unittest.TestCase):
+    def test_get_variable_with_same_name(test_case):
+        flow.clear_default_session()
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+
+        def get_v():
+            return flow.get_variable(
+                name="var",
+                shape=(5, 2),
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(),
+            )
+
+        @flow.global_function(function_config=func_config)
+        def TestJob0():
+            v1 = get_v()
+            v2 = get_v()
+            return (v1, v2)
+
+        @flow.global_function(function_config=func_config)
+        def TestJob1():
+            return get_v()
+
+        (j0_v1, j0_v2) = TestJob0().get()
+        j1_v = TestJob1().get()
+        test_case.assertTrue(np.array_equal(j0_v1.numpy(), j0_v2.numpy()))
+        test_case.assertTrue(np.array_equal(j0_v1.numpy(), j1_v.numpy()))
+
+    def test_get_job_shared_variable(test_case):
+        flow.clear_default_session()
+
+        def get_var(name, shape=(2, 5), dtype=flow.float, trainable=False):
+            return flow.get_variable(
+                name=name,
+                shape=shape,
+                dtype=dtype,
+                trainable=trainable,
+                initializer=flow.random_uniform_initializer(),
+            )
+
+        learning_rate = 0.01
+
+        @flow.global_function(type="train", function_config=flow.FunctionConfig())
+        def train(x_def: oft.Numpy.Placeholder(shape=(2, 5), dtype=flow.float)):
+            var = get_var("var", trainable=True)
+            loss = var + x_def
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                momentum=0,
+            ).minimize(loss)
+            return var
+
+        @flow.global_function()
+        def eval():
+            return get_var("var")
+
+        variables = []
+        for i in range(10):
+            input = np.random.rand(2, 5).astype(np.single)
+            eval_var = eval().get()
+            train_var = train(input).get()
+            test_case.assertTrue(np.array_equal(eval_var.numpy(), train_var.numpy()))
+            if i > 0:
+                test_case.assertTrue(
+                    np.allclose(
+                        eval_var.numpy(), variables[-1] - learning_rate / eval_var.size
+                    )
+                )
+            variables.append(eval_var.numpy())
+
+    def test_get_job_inter_and_intra_shared_variable(test_case):
+        flow.clear_default_session()
+        variable_shape = (2, 5)
+
+        def get_var(name, shape=variable_shape, dtype=flow.float, trainable=False):
+            return flow.get_variable(
+                name=name,
+                shape=shape,
+                dtype=dtype,
+                trainable=trainable,
+                initializer=flow.random_uniform_initializer(),
+            )
+
+        learning_rate = 0.01
+
+        @flow.global_function(type="train", function_config=flow.FunctionConfig())
+        def train(x_def: oft.Numpy.Placeholder(shape=variable_shape, dtype=flow.float)):
+            var = get_var("var", trainable=True)
+            loss = var + x_def
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                momentum=0,
+            ).minimize(loss)
+            return var
+
+        @flow.global_function()
+        def eval():
+            v1 = get_var("var")
+            v2 = get_var("var")
+            return (v1, v2)
+
+        variables = []
+        for i in range(10):
+            input = np.random.rand(*variable_shape).astype(np.single)
+            (var1, var2) = eval().get()
+            train_var = train(input).get()
+            test_case.assertTrue(np.array_equal(var1.numpy(), var2.numpy()))
+            test_case.assertTrue(np.array_equal(var1.numpy(), train_var.numpy()))
+            if i > 0:
+                test_case.assertTrue(
+                    np.allclose(var1.numpy(), variables[-1] - learning_rate / var1.size)
+                )
+            variables.append(var1.numpy())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_global_function_input_output.py b/python/oneflow/compatible/single_client/test/ops/test_global_function_input_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..950a8409456cd94ea1af585147ba3ec9250d2d90
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_global_function_input_output.py
@@ -0,0 +1,215 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_input_ndarray_not_contiguous(test_case, shape):
+    assert len(shape) > 1
+    more_than_one_dim_list = []
+    for (axis, dim) in enumerate(shape[1:], 1):
+        if dim > 1:
+            more_than_one_dim_list.append((axis, dim))
+    assert len(more_than_one_dim_list) > 0
+    input = np.random.rand(*shape).astype(np.single)
+    rand_axis = random.choice(more_than_one_dim_list)[0]
+    rand_dim_slice_start = random.randrange(0, input.shape[rand_axis] - 1)
+    rand_dim_slice_stop = random.randrange(
+        rand_dim_slice_start + 1, input.shape[rand_axis]
+    )
+    slice_list = []
+    for axis in range(input.ndim):
+        if axis == rand_axis:
+            slice_list.append(slice(rand_dim_slice_start, rand_dim_slice_stop))
+        else:
+            slice_list.append(slice(None))
+    slice_input = input[tuple(slice_list)]
+    test_case.assertFalse(slice_input.data.c_contiguous)
+    flow.clear_default_session()
+
+    @flow.global_function()
+    def foo_job(
+        x_def: oft.Numpy.Placeholder(shape=slice_input.shape, dtype=flow.float)
+    ):
+        y = x_def + flow.constant(1.0, shape=(1,), dtype=flow.float)
+        return y
+
+    ret = foo_job(slice_input).get()
+    test_case.assertTrue(ret.numpy().data.c_contiguous)
+    test_case.assertTrue(np.array_equal(ret.numpy(), slice_input + 1.0))
+    flow.clear_default_session()
+
+    @flow.global_function()
+    def foo_job(
+        x_def: oft.Numpy.Placeholder(shape=input.shape[::-1], dtype=flow.float)
+    ):
+        y = x_def + flow.constant(1.0, shape=(1,), dtype=flow.float)
+        return y
+
+    transpose_input = input.T
+    test_case.assertFalse(transpose_input.data.c_contiguous)
+    ret = foo_job(transpose_input).get()
+    test_case.assertTrue(ret.numpy().data.c_contiguous)
+    test_case.assertTrue(np.array_equal(ret.numpy(), transpose_input + 1.0))
+
+
+@flow.unittest.skip_unless_1n1d()
+@unittest.skipIf(os.getenv("ONEFLOW_DRY_RUN"), "can't run in dry run")
+class TestGlobalFunctionInputOutput(flow.unittest.TestCase):
+    def test_lazy_input_output(test_case):
+        flow.clear_default_session()
+        flow.enable_eager_execution(False)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo_job(input_def: oft.Numpy.Placeholder(shape=(2, 5))):
+            var = flow.get_variable(
+                name="var",
+                shape=(2, 5),
+                dtype=flow.float,
+                initializer=flow.ones_initializer(),
+            )
+            input_def = flow.cast_to_current_logical_view(input_def)
+            var = flow.cast_to_current_logical_view(var)
+            output = var + input_def
+            return output
+
+        input = np.arange(10).reshape(2, 5).astype(np.single)
+        ret = foo_job(input).get()
+        output = input + np.ones(shape=(2, 5), dtype=np.single)
+        test_case.assertTrue(np.array_equal(output, ret.numpy()))
+
+    def test_eager_output(test_case):
+        flow.clear_default_session()
+        flow.enable_eager_execution()
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo_job():
+            x = flow.constant(1, shape=(2, 5), dtype=flow.float)
+            return x
+
+        ret = foo_job().get()
+        test_case.assertTrue(
+            np.array_equal(np.ones(shape=(2, 5), dtype=np.single), ret.numpy_list()[0])
+        )
+
+    def test_eager_multi_output(test_case):
+        flow.clear_default_session()
+        flow.enable_eager_execution()
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo_job():
+            x = flow.constant(1, shape=(2, 5), dtype=flow.float)
+            y = flow.get_variable(
+                name="var",
+                shape=(64, 4),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            return (x, y)
+
+        (x, y) = foo_job().get()
+        test_case.assertTrue(
+            np.array_equal(np.ones(shape=(2, 5), dtype=np.single), x.numpy_list()[0])
+        )
+        test_case.assertTrue(
+            np.array_equal(np.zeros(shape=(64, 4), dtype=np.single), y.numpy())
+        )
+
+    def test_eager_input(test_case):
+        flow.clear_default_session()
+        flow.enable_eager_execution()
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        input = np.random.rand(2, 5).astype(np.single)
+        output = np.maximum(input, 0)
+
+        @flow.global_function(function_config=func_config)
+        def foo_job(x_def: oft.ListNumpy.Placeholder(shape=(2, 5), dtype=flow.float)):
+            y = flow.math.relu(x_def)
+            test_case.assertTrue(np.allclose(y.numpy(0), output))
+
+        foo_job([input])
+
+    def test_eager_input_fixed(test_case):
+        flow.clear_default_session()
+        flow.enable_eager_execution()
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        input = np.arange(10).astype(np.single)
+        output = input + 1.0
+
+        @flow.global_function(function_config=func_config)
+        def foo_job(x_def: oft.Numpy.Placeholder(shape=(10,), dtype=flow.float)):
+            y = x_def + flow.constant(1.0, shape=(1,), dtype=flow.float)
+            test_case.assertTrue(np.allclose(y.numpy(0), output))
+
+        foo_job(input)
+
+    def test_eager_multi_input(test_case):
+        flow.clear_default_session()
+        flow.enable_eager_execution()
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        input_1 = np.random.rand(3, 4).astype(np.single)
+        input_2 = np.array([2]).astype(np.single)
+        output = input_1 * input_2
+
+        @flow.global_function(function_config=func_config)
+        def foo_job(
+            x_def: oft.ListNumpy.Placeholder(shape=(3, 4), dtype=flow.float),
+            y_def: oft.ListNumpy.Placeholder(shape=(1,), dtype=flow.float),
+        ):
+            y = x_def * y_def
+            test_case.assertTrue(np.allclose(y.numpy(0), output))
+
+        foo_job([input_1], [input_2])
+
+    def test_eager_input_output(test_case):
+        flow.clear_default_session()
+        flow.enable_eager_execution()
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        input = np.random.rand(5, 4).astype(np.single)
+        output = input * 2.0
+
+        @flow.global_function(function_config=func_config)
+        def foo_job(x_def: oft.ListNumpy.Placeholder(shape=(5, 4), dtype=flow.float)):
+            y = x_def * flow.constant(2.0, shape=(1,), dtype=flow.float)
+            return y
+
+        ret = foo_job([input]).get()
+        test_case.assertTrue(np.allclose(output, ret.numpy_list()[0]))
+
+    def test_input_ndarray_not_contiguous(test_case):
+        _test_input_ndarray_not_contiguous(test_case, (10, 20, 30))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_global_function_signature.py b/python/oneflow/compatible/single_client/test/ops/test_global_function_signature.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb6124ce5c4beafd1e4c00738474b96c05197b94
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_global_function_signature.py
@@ -0,0 +1,390 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from typing import Dict, List, Tuple
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGlobalFunctionSignature(flow.unittest.TestCase):
+    def test_annotation_return_None(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> None:
+            pass
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(foo(data) is None)
+
+    def test_annotation_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Numpy:
+            return x
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data), data))
+
+    def test_annotation_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.ListNumpy:
+            return x
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])[0], data))
+
+    def test_annotation_watch_Numpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+
+        def Watch(x: oft.Numpy):
+            test_case.assertTrue(np.array_equal(x, data))
+
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Numpy:
+            flow.watch(x, Watch)
+            return x
+
+        foo(data)
+
+    def test_annotation_watch_ListNumpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+
+        def Watch(x: oft.ListNumpy):
+            test_case.assertTrue(np.array_equal(x[0], data))
+
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.ListNumpy:
+            flow.watch(x, Watch)
+            return x
+
+        foo([data])
+
+    def test_annotation_Dict_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> Dict[str, oft.Numpy]:
+            return {"x": x}
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data)["x"], data))
+
+    def test_annotation_Dict_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> Dict[str, oft.ListNumpy]:
+            return {"x": x}
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])["x"][0], data))
+
+    def test_annotation_Dict_Nesting_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> Dict[str, Dict[str, oft.Numpy]]:
+            return {"x": {"x": x}}
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data)["x"]["x"], data))
+
+    def test_annotation_Dict_Nesting_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(
+            x: oft.ListNumpy.Placeholder((10,))
+        ) -> Dict[str, Dict[str, oft.ListNumpy]]:
+            return {"x": {"x": x}}
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])["x"]["x"][0], data))
+
+    def test_annotation_Tuple_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: Tuple[oft.Numpy.Placeholder((10,))]) -> Tuple[oft.Numpy]:
+            return x
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo((data,))[0], data))
+
+    def test_annotation_Tuple_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: Tuple[oft.ListNumpy.Placeholder((10,))]) -> Tuple[oft.ListNumpy]:
+            return x
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(([data],))[0][0], data))
+
+    def test_annotation_Callback_Numpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+
+        def Test(x: oft.Numpy):
+            test_case.assertTrue(np.array_equal(x, data))
+
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Callback[oft.Numpy]:
+            return x
+
+        foo(data)(Test)
+
+    def test_annotation_Callback_ListNumpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+
+        def Test(x: oft.ListNumpy):
+            test_case.assertTrue(np.array_equal(x[0], data))
+
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.Callback[oft.ListNumpy]:
+            return x
+
+        foo([data])(Test)
+
+    def test_annotation_Callback_Tuple_Numpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+
+        def Test(x: Tuple[oft.Numpy]):
+            test_case.assertTrue(np.array_equal(x[0], data))
+
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Callback[Tuple[oft.Numpy]]:
+            return (x,)
+
+        foo(data)(Test)
+
+    def test_annotation_Callback_Tuple_ListNumpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+
+        def Test(x: Tuple[oft.ListNumpy]):
+            test_case.assertTrue(np.array_equal(x[0][0], data))
+
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(
+            x: oft.ListNumpy.Placeholder((10,))
+        ) -> oft.Callback[Tuple[oft.ListNumpy]]:
+            return (x,)
+
+        foo([data])(Test)
+
+    def test_annotation_Bundle_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Bundle[oft.Numpy]:
+            return x
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data), data))
+
+    def test_annotation_Bundle_List_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Bundle[oft.Numpy]:
+            return [x]
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data)[0], data))
+
+    def test_annotation_Bundle_Dict_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Bundle[oft.Numpy]:
+            return {"x": x}
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data)["x"], data))
+
+    def test_annotation_Bundle_Tuple_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Bundle[oft.Numpy]:
+            return (x,)
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data)[0], data))
+
+    def test_annotation_Bundle_Mix_Nesting_Numpy(test_case):
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder((10,))) -> oft.Bundle[oft.Numpy]:
+            return (x, (x,), [x, x, x], {"x": {256: x}})
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo(data)[0], data))
+        test_case.assertTrue(np.array_equal(foo(data)[1][0], data))
+        test_case.assertTrue(np.array_equal(foo(data)[2][0], data))
+        test_case.assertTrue(np.array_equal(foo(data)[2][1], data))
+        test_case.assertTrue(np.array_equal(foo(data)[2][2], data))
+        test_case.assertTrue(np.array_equal(foo(data)[3]["x"][256], data))
+
+    def test_annotation_Bundle_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.Bundle[oft.ListNumpy]:
+            return x
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])[0], data))
+
+    def test_annotation_Bundle_List_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.Bundle[oft.ListNumpy]:
+            return [x]
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])[0][0], data))
+
+    def test_annotation_Bundle_Dict_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.Bundle[oft.ListNumpy]:
+            return {"x": x}
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])["x"][0], data))
+
+    def test_annotation_Bundle_Tuple_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.Bundle[oft.ListNumpy]:
+            return (x,)
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])[0][0], data))
+
+    def test_annotation_Bundle_Mix_Nesting_ListNumpy(test_case):
+        flow.config.gpu_device_num(1)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo(x: oft.ListNumpy.Placeholder((10,))) -> oft.Bundle[oft.ListNumpy]:
+            return (x, (x,), [x, x, x], {"x": {256: x}})
+
+        data = np.ones((10,), dtype=np.float32)
+        test_case.assertTrue(np.array_equal(foo([data])[0][0], data))
+        test_case.assertTrue(np.array_equal(foo([data])[1][0][0], data))
+        test_case.assertTrue(np.array_equal(foo([data])[2][0][0], data))
+        test_case.assertTrue(np.array_equal(foo([data])[2][1][0], data))
+        test_case.assertTrue(np.array_equal(foo([data])[2][2][0], data))
+        test_case.assertTrue(np.array_equal(foo([data])[3]["x"][256][0], data))
+
+    def test_annotation_return_List_Numpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+        flow.clear_default_session()
+        flow.config.gpu_device_num(1)
+
+        @flow.global_function()
+        def foo(x: oft.Numpy.Placeholder(shape=data.shape)) -> List[oft.Numpy]:
+            return [x, x, x]
+
+        (x, y, z) = foo(data)
+        test_case.assertTrue(np.array_equal(x, data))
+        test_case.assertTrue(np.array_equal(y, data))
+        test_case.assertTrue(np.array_equal(z, data))
+
+    def test_annotation_return_List_ListNumpy(test_case):
+        data = np.ones((10,), dtype=np.float32)
+        flow.clear_default_session()
+
+        def foo(x: oft.ListNumpy.Placeholder(shape=data.shape)) -> List[oft.ListNumpy]:
+            return [x, x]
+
+        (x, y) = foo([data])
+        test_case.assertTrue(np.array_equal(x[0], data))
+        test_case.assertTrue(np.array_equal(y[0], data))
+
+    def test_annotation_return_List_Nesting_Tuple(test_case):
+        x = np.random.rand(5).astype(np.float32)
+        y = np.random.rand(10).astype(np.float32)
+        flow.clear_default_session()
+
+        def foo(
+            x: oft.Numpy.Placeholder(shape=x.shape),
+            y: oft.ListNumpy.Placeholder(shape=y.shape),
+        ) -> Tuple[List[oft.Numpy], List[oft.ListNumpy]]:
+            return ([x, x, x], [y, y])
+
+        (x_list, y_list) = foo(x, [y])
+        test_case.assertTrue(np.array_equal(x_list[0], x))
+        test_case.assertTrue(np.array_equal(x_list[1], x))
+        test_case.assertTrue(np.array_equal(x_list[2], x))
+        test_case.assertTrue(np.array_equal(y_list[0][0], y))
+        test_case.assertTrue(np.array_equal(y_list[1][0], y))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_global_storage.py b/python/oneflow/compatible/single_client/test/ops/test_global_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6f5a7254b2efe1c3357e099ce1f123f697ea328
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_global_storage.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow.compatible.single_client.unittest
+
+global_storage = {}
+
+
+def Get(name):
+    return global_storage.get(name).numpy()
+
+
+def Setter(name):
+    global global_storage
+
+    def _set(x):
+        global_storage[name] = x
+
+    return _set
diff --git a/python/oneflow/compatible/single_client/test/ops/test_gpt_data_loader.py b/python/oneflow/compatible/single_client/test/ops/test_gpt_data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aeca4b22af1744cc316220d23ea1c1f96a924df
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_gpt_data_loader.py
@@ -0,0 +1,207 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def _make_gpt_data_loader_func(
+    data_file_prefix,
+    seq_length,
+    num_samples,
+    batch_size,
+    dtype,
+    shuffle=None,
+    random_seed=None,
+    split_sizes=None,
+    split_index=None,
+    machine_num=1,
+    device_num=1,
+    parallel_distribution=None,
+    start_from_saved_progress=False,
+):
+    assert machine_num > 0
+    assert device_num > 0 and device_num <= 4
+    parallel_hierachy = None
+    if machine_num == 1:
+        device_strs = "0:0-{}".format(device_num - 1)
+    elif machine_num > 1:
+        device_strs = [
+            "{}:0-{}".format(machine_id, device_num - 1)
+            for machine_id in range(machine_num)
+        ]
+        parallel_hierachy = (machine_num, device_num)
+    else:
+        raise ValueError("invalid machine_num", machine_num)
+    flow.clear_default_session()
+    flow.config.cpu_device_num(4)
+    flow.config.enable_legacy_model_io(True)
+    func_cfg = flow.FunctionConfig()
+    func_cfg.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function("predict", function_config=func_cfg)
+    def gpt_loader_fn() -> flow.typing.Numpy:
+        with flow.scope.placement("cpu", device_strs, parallel_hierachy):
+            tokens = flow.data.megatron_gpt_mmap_data_loader(
+                data_file_prefix=data_file_prefix,
+                seq_length=seq_length,
+                num_samples=num_samples,
+                batch_size=batch_size,
+                dtype=dtype,
+                shuffle=shuffle,
+                random_seed=random_seed,
+                split_sizes=split_sizes,
+                split_index=split_index,
+                parallel_distribution=parallel_distribution,
+                start_from_saved_progress=start_from_saved_progress,
+                name="GPTDataLoader",
+            )
+            if (
+                isinstance(parallel_distribution, list)
+                and len(parallel_distribution) > 1
+            ):
+                tokens = flow.hierarchical_parallel_cast(
+                    tokens, parallel_distribution=["B", "B"]
+                )
+        tokens = flow.hierarchical_parallel_cast(tokens, parallel_distribution=["B"])
+        return tokens
+
+    check_point = flow.train.CheckPoint()
+    check_point.init()
+    return gpt_loader_fn
+
+
+@unittest.skipIf(
+    os.getenv("ONEFLOW_TEST_GITHUB_HOSTED"),
+    "/dataset not available on GitHub hosted servers",
+)
+class TestGPTDataLoader(flow.unittest.TestCase):
+    DATA_FILE_PREFIX = "/dataset/Megatron-LM/dummy/gpt_sample_dataset_text_document"
+    SEQ_LENGTH = 1024
+    RANDOM_SEED = 12345
+
+    @flow.unittest.skip_unless_1n1d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "2-D SBP doesn't work in eager mode",
+    )
+    def test_simple(self):
+        of_gpt_data_loader_fn = _make_gpt_data_loader_func(
+            data_file_prefix=self.DATA_FILE_PREFIX,
+            seq_length=10,
+            num_samples=10,
+            batch_size=2,
+            dtype=flow.int64,
+            shuffle=False,
+            start_from_saved_progress=True,
+        )
+        tokens = of_gpt_data_loader_fn()
+        cmp_tokens = np.array(
+            [
+                [40, 1101, 845, 845, 3772, 13, 428, 318, 257, 1492, 13],
+                [13, 612, 318, 257, 18739, 550, 257, 3290, 13, 50256, 464],
+            ],
+            dtype=np.int64,
+        )
+        self.assertTrue(np.array_equal(tokens, cmp_tokens))
+
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "2-D SBP doesn't work in eager mode",
+    )
+    def test_1n1d(self):
+        of_gpt_data_loader_fn = _make_gpt_data_loader_func(
+            data_file_prefix=self.DATA_FILE_PREFIX,
+            seq_length=self.SEQ_LENGTH,
+            num_samples=648,
+            batch_size=8,
+            split_sizes=[949, 50, 1],
+            split_index=0,
+            dtype=flow.int64,
+            shuffle=True,
+            random_seed=self.RANDOM_SEED,
+        )
+        tokens_list = []
+        for _ in range(5):
+            tokens = of_gpt_data_loader_fn()
+            tokens_list.append(tokens)
+        return np.stack(tokens_list, axis=0)
+
+    @flow.unittest.skip_unless_1n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "2-D SBP doesn't work in eager mode",
+    )
+    def test_1n4d(self):
+        of_gpt_data_loader_fn = _make_gpt_data_loader_func(
+            data_file_prefix=self.DATA_FILE_PREFIX,
+            seq_length=self.SEQ_LENGTH,
+            num_samples=648,
+            batch_size=8,
+            split_sizes=[949, 50, 1],
+            split_index=0,
+            dtype=flow.int64,
+            shuffle=True,
+            random_seed=self.RANDOM_SEED,
+            device_num=4,
+            parallel_distribution=["S(0)"],
+        )
+        tokens_list = []
+        for _ in range(5):
+            tokens = of_gpt_data_loader_fn()
+            tokens_list.append(tokens)
+        result_1n4d = np.stack(tokens_list, axis=0)
+        result_1n1d = self.test_1n1d()
+        self.assertTrue(np.array_equal(result_1n4d, result_1n1d))
+        return result_1n4d
+
+    @flow.unittest.skip_unless_2n4d()
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "2-D SBP doesn't work in eager mode",
+    )
+    def test_2n4d(self):
+        of_gpt_data_loader_fn = _make_gpt_data_loader_func(
+            data_file_prefix=self.DATA_FILE_PREFIX,
+            seq_length=self.SEQ_LENGTH,
+            num_samples=648,
+            batch_size=8,
+            split_sizes=[949, 50, 1],
+            split_index=0,
+            dtype=flow.int64,
+            shuffle=True,
+            random_seed=self.RANDOM_SEED,
+            machine_num=2,
+            device_num=4,
+            parallel_distribution=["S(0)", "B"],
+        )
+        tokens_list = []
+        for _ in range(5):
+            tokens = of_gpt_data_loader_fn()
+            tokens_list.append(tokens)
+        result_2n4d = np.stack(tokens_list, axis=0)
+        result_1n1d = self.test_1n1d()
+        self.assertTrue(np.array_equal(result_2n4d, result_1n1d))
+        return result_2n4d
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_group_norm_op.py b/python/oneflow/compatible/single_client/test/ops/test_group_norm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..468f2db8a9175690c64b11942756b84c24d33827
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_group_norm_op.py
@@ -0,0 +1,147 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def getGroupNormOutAndGrad(input, gout, num_groups, eps):
+    assert len(input.shape) == len(gout.shape)
+    assert len(input.shape) >= 3
+    channel = input.shape[1]
+    assert channel % num_groups == 0
+    orig_shape = input.shape
+    input_reshape_to_1d = np.reshape(input, (input.shape[0], num_groups, -1))
+    gout_reshape_to_1d = np.reshape(gout, (gout.shape[0], num_groups, -1))
+    gamma = np.ones((1, channel, 1), dtype=np.float32)
+    mean_np = np.mean(input_reshape_to_1d, axis=2, keepdims=True)
+    in_sub_mean = input_reshape_to_1d - mean_np
+    var_np = np.mean(np.square(in_sub_mean), axis=2, keepdims=True)
+    invar_np = 1.0 / np.sqrt(var_np + eps)
+    out_np = np.reshape(in_sub_mean * invar_np, (input.shape[0], channel, -1)) * gamma
+    gvar = (
+        np.reshape(
+            gout_reshape_to_1d * in_sub_mean * -0.5 * np.power(var_np + eps, -1.5),
+            (gout.shape[0], channel, -1),
+        )
+        * gamma
+    )
+    gvar = np.reshape(gvar, (gout.shape[0], num_groups, -1))
+    gvar = np.sum(gvar, axis=2, keepdims=True)
+    gmean = np.reshape(gout_reshape_to_1d, (gout.shape[0], channel, -1)) * gamma
+    gmean = np.sum(
+        np.reshape(gmean, (gout.shape[0], num_groups, -1)), axis=2, keepdims=True
+    )
+    gmean *= -invar_np
+    scale = 1.0 / input_reshape_to_1d.shape[2]
+    tmp = scale * np.sum(-2.0 * in_sub_mean, axis=2, keepdims=True) * gvar
+    gmean += tmp
+    gin_np = (
+        np.reshape(
+            gout_reshape_to_1d * invar_np
+            + gvar * scale * 2.0 * in_sub_mean
+            + gmean * scale,
+            (input.shape[0], channel, -1),
+        )
+        * gamma
+    )
+    return (np.reshape(out_np, list(orig_shape)), np.reshape(gin_np, list(orig_shape)))
+
+
+def _compare_group_norm_nd_with_np(
+    input_shape, device_type, machine_ids, device_counts, num_groups, eps, affine
+):
+    assert device_type in ["cpu", "gpu"]
+    assert len(input_shape) >= 3 and len(input_shape) <= 5
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    input = np.random.random(size=input_shape).astype(np.float32)
+    gout = np.random.random(size=input_shape).astype(np.float32)
+    (out_np, gin_np) = getGroupNormOutAndGrad(input, gout, num_groups, eps)
+
+    def assert_prediction_grad(gin_of: tp.Numpy):
+        assert np.allclose(gin_of, gin_np, atol=1e-05)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def groupNormJob(
+        of_input: tp.Numpy.Placeholder(shape=input.shape),
+        multipler: tp.Numpy.Placeholder(shape=input.shape),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=of_input.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+                name="v",
+            )
+            x_var = of_input + v
+            flow.watch_diff(x_var, assert_prediction_grad)
+        out = flow.nn.GroupNorm(x_var, num_groups=num_groups, eps=eps, affine=True)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(out * multipler)
+        return out
+
+    of_out = groupNormJob(input, gout)
+    assert np.allclose(of_out, out_np, atol=1e-05)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGroupNormND1n1d(flow.unittest.TestCase):
+    def test_group_norm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(4, 8, 32, 32)]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["machine_ids"] = ["0:0"]
+        arg_dict["device_counts"] = [1]
+        arg_dict["num_groups"] = [4, 8]
+        arg_dict["eps"] = [0.001]
+        arg_dict["affine"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_group_norm_nd_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestGroupNormND1n2d(flow.unittest.TestCase):
+    def test_group_norm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(4, 8, 32, 32)]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["machine_ids"] = ["0:0-1"]
+        arg_dict["device_counts"] = [2]
+        arg_dict["num_groups"] = [4, 8]
+        arg_dict["eps"] = [0.001]
+        arg_dict["affine"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_group_norm_nd_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_hardsigmoid.py b/python/oneflow/compatible/single_client/test/ops/test_hardsigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7f316e5dd64a7d086b94a7eb910318f5913d9c2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_hardsigmoid.py
@@ -0,0 +1,205 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_hardsigmoid_with_np(
+    input_shape, device_type, value_type, machine_ids, device_counts
+):
+    if value_type[1] == flow.float16:
+        input_1 = np.random.uniform(-3.5, 3.5, size=input_shape).astype(np.float16)
+        input_1 += np.random.randn(*input_shape).astype(np.float16)
+        input_1 = np.array(input_1, dtype=value_type[0])
+    else:
+        input_1 = np.random.uniform(-3.5, 3.5, size=input_shape).astype(value_type[0])
+        input_1 += np.random.randn(*input_shape).astype(value_type[0])
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    if value_type[1] == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type[1])
+
+    def np_hardsigmoid(input):
+        input_shape = input.shape
+        input = input.flatten()
+        elem_cnt = input.size
+        _zero = np.zeros_like(input)
+        for i in range(elem_cnt):
+            if input[i] >= 3:
+                _zero[i] = 1
+            elif input[i] <= -3:
+                _zero[i] = 0
+            else:
+                _zero[i] = input[i] / 6 + 0.5
+        np_hsigmoid_out = np.reshape(_zero, newshape=input_shape)
+        return np.array(np_hsigmoid_out).astype(value_type[0])
+
+    np_out_hardsigmoid = np_hardsigmoid(input_1)
+
+    def np_diff(input):
+        input_shape = input.shape
+        input = input.flatten()
+        elem_cnt = input.size
+        diff = np.zeros(shape=(elem_cnt,), dtype=value_type[0])
+        for i in range(elem_cnt):
+            if input[i] > -3 and input[i] < 3:
+                diff[i] = 1 / 6
+        diff = np.reshape(diff, newshape=input_shape)
+        return diff
+
+    _np_grad = np_diff(input_1)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        if value_type[1] == flow.float16:
+            assert np.allclose(blob, _np_grad, atol=0.001)
+        else:
+            assert np.allclose(blob, _np_grad, atol=1e-05)
+
+    if value_type[1] == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_hardsigmoid(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=flow.float32)
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+                x_f16 = flow.cast(x_var, flow.float16)
+            of_hardsigmoid_out_f16 = flow.nn.hardsigmoid(x_f16)
+            of_hardsigmoid_out_f32 = flow.cast(of_hardsigmoid_out_f16, flow.float32)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_hardsigmoid_out_f32)
+            flow.watch_diff(x_var, assert_prediction_grad)
+            return of_hardsigmoid_out_f32
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_hardsigmoid(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=value_type[1])
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=value_type[1],
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+            flow.watch_diff(x_var, assert_prediction_grad)
+            of_hardsigmoid_out = flow.nn.hardsigmoid(x_var)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_hardsigmoid_out)
+            return of_hardsigmoid_out
+
+    of_out_hardsigmoid = oneflow_hardsigmoid(input_1)
+    if value_type[1] == flow.float16:
+        assert np.allclose(of_out_hardsigmoid, np_out_hardsigmoid, atol=0.01)
+    else:
+        assert np.allclose(of_out_hardsigmoid, np_out_hardsigmoid, atol=1e-05)
+
+
+def _gen_arg_dict(shape, device_type, value_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    if value_type == "float" and device_type == "cpu":
+        arg_dict["value_type"] = [
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    else:
+        arg_dict["value_type"] = [
+            (np.float32, flow.float16),
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testhardsigmoid1n1d(flow.unittest.TestCase):
+    def test_hardsigmoid_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16),
+            device_type="cpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardsigmoid_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_hardsigmoid_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(16, 16),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardsigmoid_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testhardsigmoid1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_hardsigmoid_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 16),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardsigmoid_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_hardswish.py b/python/oneflow/compatible/single_client/test/ops/test_hardswish.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf17fc76d8e2a8f701a5da788131c6866a67c923
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_hardswish.py
@@ -0,0 +1,217 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_hardswish_with_np(
+    input_shape, device_type, value_type, machine_ids, device_counts
+):
+    min_val = random.randint(-4, -1)
+    max_val = random.randint(0, 4)
+    assert min_val < max_val
+    if value_type[1] == flow.float16:
+        input_1 = np.random.uniform(
+            min_val - 0.5, max_val + 0.5, size=input_shape
+        ).astype(np.float16)
+        input_1 += np.random.randn(*input_shape).astype(np.float16)
+        input_1 = np.array(input_1, dtype=value_type[0])
+    else:
+        input_1 = np.random.uniform(
+            min_val - 0.5, max_val + 0.5, size=input_shape
+        ).astype(value_type[0])
+        input_1 += np.random.randn(*input_shape).astype(value_type[0])
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    if value_type[1] == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type[1])
+
+    def np_hardswish(input):
+        elem_cnt = input.size
+        init_shape = input.shape
+        input = input.flatten()
+        out = np.zeros_like(input)
+        for i in range(elem_cnt):
+            if input[i] >= 3:
+                out[i] = input[i]
+            elif input[i] <= -3:
+                pass
+            else:
+                out[i] = input[i] * (input[i] + 3) / 6
+        out = np.reshape(out, init_shape)
+        return np.array(out).astype(value_type[0])
+
+    np_out_hardswish = np_hardswish(input_1)
+
+    def np_diff(input):
+        input_shape = input.shape
+        input = input.flatten()
+        elem_cnt = input.size
+        diff = np.zeros(shape=(elem_cnt,))
+        for i in range(elem_cnt):
+            if input[i] > -3 and input[i] < 3:
+                diff[i] = input[i] / 3 + 0.5
+            elif input[i] >= 3:
+                diff[i] = 1
+            else:
+                pass
+        diff = np.reshape(diff, newshape=input_shape)
+        diff = np.array(diff, dtype=value_type[0])
+        return diff
+
+    _np_grad = np_diff(input_1)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        if value_type[1] == flow.float16:
+            assert np.allclose(blob, _np_grad, atol=0.001)
+        else:
+            assert np.allclose(blob, _np_grad, atol=1e-05)
+
+    if value_type[1] == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_hardswish(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=flow.float32)
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+                x_f16 = flow.cast(x_var, flow.float16)
+            of_hardswish_out_f16 = flow.nn.hardswish(x_f16)
+            of_hardswish_out_f32 = flow.cast(of_hardswish_out_f16, flow.float32)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_hardswish_out_f32)
+            flow.watch_diff(x_var, assert_prediction_grad)
+            return of_hardswish_out_f32
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_hardswish(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=value_type[1])
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=value_type[1],
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+            flow.watch_diff(x_var, assert_prediction_grad)
+            of_hardswish_out = flow.nn.hardswish(x_var)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_hardswish_out)
+            return of_hardswish_out
+
+    of_out_hardswish = oneflow_hardswish(input_1)
+    if value_type[1] == flow.float16:
+        assert np.allclose(of_out_hardswish, np_out_hardswish, atol=0.001)
+    else:
+        assert np.allclose(of_out_hardswish, np_out_hardswish, atol=1e-05)
+
+
+def _gen_arg_dict(shape, device_type, value_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    if value_type == "float" and device_type == "cpu":
+        arg_dict["value_type"] = [
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    else:
+        arg_dict["value_type"] = [
+            (np.float32, flow.float16),
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testhardswish1n1d(flow.unittest.TestCase):
+    def test_hardswish_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3),
+            device_type="cpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardswish_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_hardswish_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 4, 4),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardswish_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testhardswish1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_hardswish_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 4),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardswish_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_hardtanh.py b/python/oneflow/compatible/single_client/test/ops/test_hardtanh.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc1c04439015f43a44d2124ed33a85d0aa5f5eb7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_hardtanh.py
@@ -0,0 +1,201 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_hardtanh_with_np(
+    input_shape, device_type, value_type, machine_ids, device_counts
+):
+    min_val = random.randint(-10, -1)
+    max_val = random.randint(0, 10)
+    assert min_val < max_val
+    if value_type[1] == flow.float16:
+        input_1 = np.random.uniform(
+            min_val - 0.5, max_val + 0.5, size=input_shape
+        ).astype(np.float16)
+        input_1 += np.random.randn(*input_shape).astype(np.float16)
+        input_1 = np.array(input_1, dtype=value_type[0])
+    else:
+        input_1 = np.random.uniform(
+            min_val - 0.5, max_val + 0.5, size=input_shape
+        ).astype(value_type[0])
+        input_1 += np.random.randn(*input_shape).astype(value_type[0])
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    if value_type[1] == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type[1])
+
+    def np_hardtanh(input, min_val, max_val):
+        out = np.clip(input, min_val, max_val)
+        return np.array(out).astype(value_type[0])
+
+    np_out_hardtanh = np_hardtanh(input_1, min_val, max_val)
+
+    def np_diff(input, min_val, max_val):
+        input_shape = input.shape
+        input = input.flatten()
+        elem_cnt = input.size
+        diff = np.zeros(shape=(elem_cnt,))
+        for i in range(elem_cnt):
+            if input[i] > min_val and input[i] < max_val:
+                diff[i] = 1
+        diff = np.reshape(diff, newshape=input_shape)
+        return diff
+
+    _np_grad = np_diff(input_1, min_val, max_val)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        if value_type[1] == flow.float16:
+            assert np.allclose(blob, _np_grad, atol=0.001)
+        else:
+            assert np.allclose(blob, _np_grad, atol=1e-05)
+
+    if value_type[1] == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_hardtanh(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=flow.float32)
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+                x_f16 = flow.cast(x_var, flow.float16)
+            of_hardtanh_out_f16 = flow.nn.hardtanh(x_f16, min_val, max_val)
+            of_hardtanh_out_f32 = flow.cast(of_hardtanh_out_f16, flow.float32)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_hardtanh_out_f32)
+            flow.watch_diff(x_var, assert_prediction_grad)
+            return of_hardtanh_out_f32
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_hardtanh(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=value_type[1])
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=value_type[1],
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+            flow.watch_diff(x_var, assert_prediction_grad)
+            of_hardtanh_out = flow.nn.hardtanh(x_var, min_val, max_val)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_hardtanh_out)
+            return of_hardtanh_out
+
+    of_out_hardtanh = oneflow_hardtanh(input_1)
+    if value_type[1] == flow.float16:
+        assert np.allclose(of_out_hardtanh, np_out_hardtanh, atol=0.01)
+    else:
+        assert np.allclose(of_out_hardtanh, np_out_hardtanh, atol=1e-05)
+
+
+def _gen_arg_dict(shape, device_type, value_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    if value_type == "float" and device_type == "cpu":
+        arg_dict["value_type"] = [
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    else:
+        arg_dict["value_type"] = [
+            (np.float32, flow.float16),
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testhardtanh1n1d(flow.unittest.TestCase):
+    def test_hardtanh_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(16, 16),
+            device_type="cpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardtanh_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_hardtanh_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 16),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardtanh_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testhardtanh1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_hardtanh_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 16),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_hardtanh_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_hierarchical_parallel_cast.py b/python/oneflow/compatible/single_client/test/ops/test_hierarchical_parallel_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d83e9b84ddded46766915660f29add51895552d8
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_hierarchical_parallel_cast.py
@@ -0,0 +1,386 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _test(test_case, device_num):
+    (m, k, n) = (5, 6, 7)
+    a_shape = (m, k)
+    b_shape = (k, n)
+    c_shape = (n,)
+    flow.config.gpu_device_num(device_num)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function("train", function_config=func_config)
+    def test_fn(
+        a: flow.typing.Numpy.Placeholder(a_shape),
+        b: flow.typing.Numpy.Placeholder(b_shape),
+        c: flow.typing.Numpy.Placeholder(c_shape),
+    ) -> flow.typing.Numpy:
+        var_a = flow.get_variable(
+            name="var_a",
+            shape=a_shape,
+            dtype=flow.float32,
+            initializer=flow.ones_initializer(),
+            distribute=flow.distribute.split(1),
+        )
+        a = flow.hierarchical_parallel_cast(a, parallel_distribution=["S(1)"])
+        a = var_a * a
+        out = flow.matmul(a, b)
+        out = flow.hierarchical_parallel_cast(out, parallel_distribution=["B"])
+        c = flow.hierarchical_parallel_cast(c, parallel_distribution=["B"])
+        out = flow.nn.bias_add(out, c)
+        lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+        flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(out)
+        return out
+
+    a = np.random.rand(*a_shape).astype(np.float32)
+    b = np.random.rand(*b_shape).astype(np.float32)
+    c = np.random.rand(*c_shape).astype(np.float32)
+    out = test_fn(a, b, c)
+    test_case.assertTrue(np.allclose(out, np.matmul(a, b) + c))
+
+
+@flow.unittest.skip_unless_1n2d()
+@unittest.skipIf(
+    flow.unittest.env.eager_execution_enabled(), "2-D SBP doesn't work in eager mode"
+)
+class TestParallelCast(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_on_gpu(test_case):
+        _test(test_case, 2)
+
+
+def _test_gather(test_case, src, dst):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function("predict", function_config=func_config)
+    def test_fn(
+        x: flow.typing.Numpy.Placeholder((1024, 1024)),
+        indices: flow.typing.Numpy.Placeholder(shape=(64,), dtype=flow.int32),
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement("gpu", "0:0-3", (2, 2)):
+            if src[0] == "S(0)":
+                x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B", "B"])
+                indices = flow.hierarchical_parallel_cast(
+                    indices, parallel_distribution=["S(0)", "S(0)"]
+                )
+                if src[1] == "S(0)":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "B"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["S(0)", "S(0)"]
+                    )
+                elif src[1] == "S(1)":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "S(1)"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["S(0)", "B"]
+                    )
+                elif src[1] == "P":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "S(0)"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["S(0)", "B"]
+                    )
+                elif src[1] == "B":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "B"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["S(0)", "B"]
+                    )
+            elif src[0] == "P":
+                x = flow.hierarchical_parallel_cast(
+                    x, parallel_distribution=["S(0)", "S(0)"]
+                )
+                indices = flow.hierarchical_parallel_cast(
+                    indices, parallel_distribution=["B", "B"]
+                )
+                if src[1] == "S(0)":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["S(0)", "B"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "S(0)"]
+                    )
+                elif src[1] == "S(1)":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["S(0)", "S(1)"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "B"]
+                    )
+                elif src[1] == "P":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["S(0)", "S(0)"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "B"]
+                    )
+                elif src[1] == "B":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["S(0)", "B"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "B"]
+                    )
+            elif src[0] == "B":
+                x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B", "B"])
+                indices = flow.hierarchical_parallel_cast(
+                    indices, parallel_distribution=["B", "B"]
+                )
+                if src[1] == "S(0)":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "B"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "S(0)"]
+                    )
+                elif src == "S(1)":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "S(1)"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "B"]
+                    )
+                elif src == "P":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "S(0)"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "B"]
+                    )
+                elif src == "B":
+                    x = flow.hierarchical_parallel_cast(
+                        x, parallel_distribution=["B", "B"]
+                    )
+                    indices = flow.hierarchical_parallel_cast(
+                        indices, parallel_distribution=["B", "B"]
+                    )
+                else:
+                    raise NotImplementedError
+            x = flow.gather(x, indices)
+            x = flow.hierarchical_parallel_cast(
+                x, parallel_distribution=dst, name="gather_cast"
+            )
+            if dst[0] == "S(0)":
+                x = flow.hierarchical_parallel_cast(
+                    x, parallel_distribution=["S(0)", "S(0)"]
+                )
+            elif dst[0] == "B":
+                x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B", "B"])
+            elif dst[0] == "S(1)":
+                x = flow.hierarchical_parallel_cast(
+                    x, parallel_distribution=["S(1)", "S(1)"]
+                )
+            else:
+                raise NotImplementedError
+        x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B"])
+        return x
+
+    x_arr = np.random.rand(1024, 1024).astype(np.float32)
+    indices = np.random.randint(low=0, high=1024, size=(64,))
+    y_arr = test_fn(x_arr, indices)
+    gather_out = x_arr[indices]
+    test_case.assertTrue(np.allclose(y_arr.flatten(), gather_out.flatten()))
+
+
+def _test_train(test_case):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    flow.config.enable_legacy_model_io(True)
+    flow.config.enable_model_io_v2(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function("train", function_config=func_config)
+    def test_fn(
+        x: flow.typing.Numpy.Placeholder((1024, 4)),
+        indices: flow.typing.Numpy.Placeholder(shape=(12,), dtype=flow.int32),
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement("gpu", "0:0-3", (2, 2)):
+            x = flow.hierarchical_parallel_cast(
+                x, parallel_distribution=["S(0)", "S(0)"]
+            )
+            indices = flow.hierarchical_parallel_cast(
+                indices, parallel_distribution=["B", "B"]
+            )
+            x = flow.hierarchical_parallel_cast(x, parallel_distribution=["S(0)", "B"])
+            v = flow.get_variable(
+                name="v",
+                shape=(1024, 4),
+                parallel_distribution=["S(0)", "B"],
+                initializer=flow.zeros_initializer(),
+            )
+            x = x + v
+            indices = flow.hierarchical_parallel_cast(
+                indices, parallel_distribution=["B", "S(0)"]
+            )
+            x = flow.gather(x, indices)
+            x = flow.hierarchical_parallel_cast(
+                x,
+                parallel_distribution=["B", "S(0)"],
+                grad_mode="manual",
+                grad_parallel_distribution=["B", "S(0)"],
+            )
+            x = flow.math.relu(x)
+            x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B", "B"])
+        x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B"])
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(x)
+        return x
+
+    x_arr = np.random.rand(1024, 4).astype(np.float32)
+    indices = np.random.randint(low=0, high=20, size=(12,))
+    checkpoint = flow.train.CheckPoint()
+    checkpoint.init()
+    y_arr = test_fn(x_arr, indices)
+    gather_out = x_arr[indices]
+    test_case.assertTrue(np.allclose(y_arr.flatten(), gather_out.flatten()))
+
+
+def _test_reshape(test_case):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    flow.config.enable_legacy_model_io(True)
+    flow.config.enable_model_io_v2(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowJob(x: flow.typing.Numpy.Placeholder((4, 6), dtype=flow.float)):
+        with flow.scope.placement("gpu", "0:0-3", (2, 2)):
+            v = flow.get_variable(
+                "x",
+                shape=(4, 6),
+                dtype=flow.float,
+                initializer=flow.constant_initializer(0),
+                trainable=True,
+                parallel_distribution=["S(0)", "S(1)"],
+            )
+            x = flow.hierarchical_parallel_cast(
+                x, parallel_distribution=["S(0)", "S(1)"]
+            )
+            x += v
+            loss = flow.reshape(x, (4, 2, 3))
+        loss = flow.hierarchical_parallel_cast(loss, parallel_distribution=["S(0)"])
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    x = np.random.randn(4, 6).astype(np.float32)
+    my_loss = FlowJob(x).get()
+    test_case.assertTrue(np.allclose(x.flatten(), my_loss.numpy().flatten()))
+
+
+def _test_reshape_like(test_case):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    flow.config.enable_legacy_model_io(True)
+    flow.config.enable_model_io_v2(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="predict", function_config=func_config)
+    def FlowJob(x: flow.typing.Numpy.Placeholder((4, 3, 2, 3), dtype=flow.float)):
+        with flow.scope.placement("gpu", "0:0-3", (2, 2)):
+            v1 = flow.get_variable(
+                "v1",
+                shape=(4, 3, 2, 3),
+                dtype=flow.float,
+                initializer=flow.constant_initializer(0),
+                trainable=True,
+                parallel_distribution=["S(0)", "S(2)"],
+            )
+            v2 = flow.get_variable(
+                "v2",
+                shape=(4, 3, 6),
+                dtype=flow.float,
+                initializer=flow.constant_initializer(0),
+                trainable=True,
+                parallel_distribution=["S(0)", "S(2)"],
+            )
+            x = flow.hierarchical_parallel_cast(
+                x, parallel_distribution=["S(0)", "S(2)"]
+            )
+            x += v1
+            loss = flow.reshape_like(x, v2)
+        loss = flow.hierarchical_parallel_cast(loss, parallel_distribution=["S(0)"])
+        return loss
+
+    x = np.random.randn(4, 3, 2, 3).astype(np.float32)
+    my_loss = FlowJob(x).get()
+    test_case.assertTrue(np.allclose(x.flatten(), my_loss.numpy().flatten()))
+
+
+@flow.unittest.skip_unless_1n4d()
+@unittest.skipIf(
+    flow.unittest.env.eager_execution_enabled(), "2-D SBP doesn't work in eager mode"
+)
+class TestHierarchicalParallelCast(flow.unittest.TestCase):
+    def test_change_axis1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src"] = [
+            ["S(0)", "S(0)"],
+            ["S(0)", "S(1)"],
+            ["S(0)", "P"],
+            ["S(0)", "B"],
+        ]
+        arg_dict["dst"] = [["S(0)", "S(0)"], ["S(0)", "S(1)"], ["S(0)", "B"]]
+        for arg in GenArgList(arg_dict):
+            _test_gather(test_case, *arg)
+
+    def test_change_axis0(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src"] = [["B", "S(0)"], ["P", "S(0)"]]
+        arg_dict["dst"] = [["B", "S(0)"], ["S(1)", "S(0)"]]
+        for arg in GenArgList(arg_dict):
+            _test_gather(test_case, *arg)
+
+    def test_train(test_case):
+        _test_train(test_case)
+
+    def test_reshape(test_case):
+        _test_reshape(test_case)
+
+    def test_reshape_like(test_case):
+        _test_reshape_like(test_case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_identity_n.py b/python/oneflow/compatible/single_client/test/ops/test_identity_n.py
new file mode 100644
index 0000000000000000000000000000000000000000..707b6f6e46a8492de7d6dea06f47fa78925e15b3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_identity_n.py
@@ -0,0 +1,44 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from typing import Tuple
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestIdentityN(flow.unittest.TestCase):
+    def test_identity_n(test_case):
+        @flow.global_function(function_config=func_config)
+        def identity_n_job(xs: Tuple[(oft.Numpy.Placeholder((5, 2)),) * 3]):
+            return flow.identity_n(xs)
+
+        inputs = tuple((np.random.rand(5, 2).astype(np.float32) for i in range(3)))
+        res = identity_n_job(inputs).get()
+        for i in range(3):
+            test_case.assertTrue(np.array_equal(res[i].numpy(), inputs[i]))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_image_batch_align.py b/python/oneflow/compatible/single_client/test/ops/test_image_batch_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..d95236a245d3fa233507a8f41ab90facd60d6796
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_image_batch_align.py
@@ -0,0 +1,112 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _of_image_batch_align(images, input_shape, output_shape, alignment):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def image_batch_align_job(
+        images_def: oft.ListListNumpy.Placeholder(shape=input_shape, dtype=flow.float)
+    ):
+        images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+        image = flow.image_batch_align(
+            images_buffer, shape=output_shape[1:], dtype=flow.float, alignment=alignment
+        )
+        return image
+
+    image = image_batch_align_job([images]).get()
+    return image.numpy_list()[0]
+
+
+def _read_images_by_cv(image_files):
+    images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+    return [np.expand_dims(image, axis=0) for image in images]
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    assert image_static_shape[0] == 1, str(image_static_shape)
+    image_static_shape[0] = len(image_shapes)
+    return image_static_shape
+
+
+def _roundup(x, n):
+    return int((x + n - 1) / n) * n
+
+
+def _compare_image_batch_align(
+    test_case, image_files, alignment, print_debug_info=False
+):
+    images = _read_images_by_cv(image_files)
+    image_shape = _get_images_static_shape(images)
+    assert len(image_shape) == 4
+    aligned_image_shape = [
+        image_shape[0],
+        _roundup(image_shape[1], alignment),
+        _roundup(image_shape[2], alignment),
+        image_shape[3],
+    ]
+    if print_debug_info:
+        print("image_shape:", image_shape)
+        print("aligned_image_shape:", aligned_image_shape)
+    image_tensor = _of_image_batch_align(
+        images, tuple(image_shape), tuple(aligned_image_shape), alignment
+    )
+    test_case.assertTrue(np.array_equal(aligned_image_shape, image_tensor.shape))
+    empty_image_array = np.zeros(aligned_image_shape, np.single)
+    for (empty_image, image) in zip(empty_image_array, images):
+        image = image.squeeze()
+        empty_image[0 : image.shape[0], 0 : image.shape[1], :] = image
+    test_case.assertTrue(np.array_equal(image_tensor, empty_image_array))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestImageBatchAlign(flow.unittest.TestCase):
+    def test_image_batch_align(test_case):
+        _compare_image_batch_align(
+            test_case,
+            [
+                "/dataset/mscoco_2017/val2017/000000000139.jpg",
+                "/dataset/mscoco_2017/val2017/000000000632.jpg",
+                "/dataset/mscoco_2017/val2017/000000000785.jpg",
+                "/dataset/mscoco_2017/val2017/000000001000.jpg",
+            ],
+            16,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_image_decode.py b/python/oneflow/compatible/single_client/test/ops/test_image_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..12ac219c1d15d11e9c6c0e4f465d0ea75539516e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_image_decode.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+from PIL import Image
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _of_image_decode(images):
+    image_files = [open(im, "rb") for im in images]
+    images_bytes = [imf.read() for imf in image_files]
+    static_shape = (len(images_bytes), max([len(bys) for bys in images_bytes]))
+    for imf in image_files:
+        imf.close()
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def image_decode_job(
+        images_def: oft.ListListNumpy.Placeholder(shape=static_shape, dtype=flow.int8)
+    ):
+        images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+        decoded_images_buffer = flow.image_decode(images_buffer)
+        return flow.tensor_buffer_to_tensor_list(
+            decoded_images_buffer, shape=(640, 640, 3), dtype=flow.uint8
+        )
+
+    images_np_arr = [
+        np.frombuffer(bys, dtype=np.byte).reshape(1, -1) for bys in images_bytes
+    ]
+    decoded_images = image_decode_job([images_np_arr]).get().numpy_lists()
+    return decoded_images[0]
+
+
+def _compare_jpg_decode_with_pil(test_case, images, print_debug_info=False):
+    """
+    The jpg image's decoded results with opencv and pil image are slightly different,
+    their green channels have difference of 1.
+    """
+    of_decoded_images = _of_image_decode(images)
+    pil_images = [Image.open(image) for image in images]
+    pil_decoded_images = [np.array(image)[:, :, ::-1] for image in pil_images]
+    for (of_decoded_image, pil_decoded_image) in zip(
+        of_decoded_images, pil_decoded_images
+    ):
+        of_decoded_image = of_decoded_image.squeeze()
+        test_case.assertTrue(len(of_decoded_image.shape) == 3)
+        test_case.assertTrue(len(pil_decoded_image.shape) == 3)
+        diff = of_decoded_image - pil_decoded_image
+        diff_index = np.where(diff != 0)
+        diff_abs_values = diff[diff_index]
+        if print_debug_info:
+            print("of_decoded_image:\n", of_decoded_image, of_decoded_image.shape)
+            print("pil_decoded_image:\n", pil_decoded_image, pil_decoded_image.shape)
+            print("diff_index:\n", diff_index)
+            print("diff_abs_values:\n", diff_abs_values)
+            print(
+                "of_decoded_image diff:\n",
+                of_decoded_image[diff_index[0], diff_index[1]],
+            )
+            print(
+                "pil_decoded_image diff:\n",
+                pil_decoded_image[diff_index[0], diff_index[1]],
+            )
+        test_case.assertTrue(np.all(diff_index[-1] == 1))
+        test_case.assertTrue(np.all(diff_abs_values == 1))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestImageDecode(flow.unittest.TestCase):
+    def test_image_decode(test_case):
+        _compare_jpg_decode_with_pil(
+            test_case,
+            [
+                "/dataset/mscoco_2017/val2017/000000000139.jpg",
+                "/dataset/mscoco_2017/val2017/000000000632.jpg",
+            ],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_image_flip.py b/python/oneflow/compatible/single_client/test/ops/test_image_flip.py
new file mode 100644
index 0000000000000000000000000000000000000000..449d807c7719a45a1c52838fd9feb13380cffec0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_image_flip.py
@@ -0,0 +1,89 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _of_image_flip(images, image_shape, flip_code):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def image_flip_job(
+        images_def: oft.ListListNumpy.Placeholder(shape=image_shape, dtype=flow.float)
+    ):
+        images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+        flip_images = flow.image_flip(images_buffer, flip_code)
+        return flow.tensor_buffer_to_tensor_list(
+            flip_images, shape=image_shape[1:], dtype=flow.float
+        )
+
+    image_tensor = image_flip_job([images]).get()
+    return image_tensor.numpy_lists()[0]
+
+
+def _read_images_by_cv(image_files):
+    images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+    return [np.expand_dims(image, axis=0) for image in images]
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    assert image_static_shape[0] == 1, str(image_static_shape)
+    image_static_shape[0] = len(image_shapes)
+    return image_static_shape
+
+
+def _compare_image_flip_with_cv(test_case, image_files):
+    images = _read_images_by_cv(image_files)
+    assert all([len(image.shape) == 4 for image in images])
+    image_shape = _get_images_static_shape(images)
+    flip_images = _of_image_flip(images, tuple(image_shape), 1)
+    for (image, flip_image) in zip(images, flip_images):
+        exp_flip_image = cv2.flip(image.squeeze(), 1)
+        test_case.assertTrue(np.allclose(exp_flip_image, flip_image))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestImageFlip(flow.unittest.TestCase):
+    def test_image_flip(test_case):
+        _compare_image_flip_with_cv(
+            test_case,
+            [
+                "/dataset/mscoco_2017/val2017/000000000139.jpg",
+                "/dataset/mscoco_2017/val2017/000000000632.jpg",
+            ],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_image_normalize.py b/python/oneflow/compatible/single_client/test/ops/test_image_normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..817e01c0bf31a457e3795536cc8885b3d967fb62
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_image_normalize.py
@@ -0,0 +1,93 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _of_image_normalize(images, image_shape, std, mean):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def image_normalize_job(
+        images_def: oft.ListListNumpy.Placeholder(shape=image_shape, dtype=flow.float)
+    ):
+        images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
+        norm_images = flow.image_normalize(images_buffer, std, mean)
+        return flow.tensor_buffer_to_tensor_list(
+            norm_images, shape=image_shape[1:], dtype=flow.float
+        )
+
+    image_tensor = image_normalize_job([images]).get()
+    return image_tensor.numpy_lists()[0]
+
+
+def _read_images_by_cv(image_files):
+    images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+    return [np.expand_dims(image, axis=0) for image in images]
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    assert image_static_shape[0] == 1, str(image_static_shape)
+    image_static_shape[0] = len(image_shapes)
+    return image_static_shape
+
+
+def _compare_image_normalize(test_case, image_files, std, mean):
+    images = _read_images_by_cv(image_files)
+    assert all([len(image.shape) == 4 for image in images])
+    image_shape = _get_images_static_shape(images)
+    norm_images = _of_image_normalize(images, tuple(image_shape), std, mean)
+    std_array = np.array(std).reshape(1, 1, 1, -1)
+    mean_array = np.array(mean).reshape(1, 1, 1, -1)
+    for (image, norm_image) in zip(images, norm_images):
+        exp_norm_image = (image - mean_array) / std_array
+        test_case.assertTrue(np.allclose(exp_norm_image, norm_image))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestImageNormalize(flow.unittest.TestCase):
+    def test_image_normalize(test_case):
+        _compare_image_normalize(
+            test_case,
+            [
+                "/dataset/mscoco_2017/val2017/000000000139.jpg",
+                "/dataset/mscoco_2017/val2017/000000000632.jpg",
+            ],
+            (102.9801, 115.9465, 122.7717),
+            (1.0, 1.0, 1.0),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_image_resize.py b/python/oneflow/compatible/single_client/test/ops/test_image_resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..d984fc5ce7b4e6a2455d7baf5b98b7d06946d6e4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_image_resize.py
@@ -0,0 +1,392 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import typing as tp
+import unittest
+
+import cv2
+import image_test_util
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as otp
+
+
+def _make_image_resize_to_fixed_func(
+    target_size,
+    image_static_shape,
+    dtype,
+    origin_dtype=flow.float32,
+    channels=3,
+    interpolation_type="bilinear",
+    func_cfg=None,
+    print_debug_info=False,
+):
+    @flow.global_function(type="predict", function_config=func_cfg)
+    def image_resize_to_fixed(
+        image_list: otp.ListListNumpy.Placeholder(
+            shape=image_static_shape, dtype=origin_dtype
+        )
+    ) -> tp.Tuple[otp.ListNumpy, otp.ListNumpy]:
+        image_buffer = flow.tensor_list_to_tensor_buffer(image_list)
+        (res_image, scale, _) = flow.image.resize(
+            image_buffer,
+            target_size=target_size,
+            keep_aspect_ratio=False,
+            channels=channels,
+            dtype=dtype,
+            interpolation_type=interpolation_type,
+        )
+        return (res_image, scale)
+
+    return image_resize_to_fixed
+
+
+def _make_image_resize_keep_aspect_ratio_func(
+    target_size,
+    min_size,
+    max_size,
+    image_static_shape,
+    aspect_ratio_list,
+    dtype,
+    channels=3,
+    resize_side="shorter",
+    interpolation_type="bilinear",
+    func_cfg=None,
+    print_debug_info=False,
+):
+    @flow.global_function(type="predict", function_config=func_cfg)
+    def image_resize_keep_aspect_ratio(
+        image_list: otp.ListListNumpy.Placeholder(shape=image_static_shape, dtype=dtype)
+    ) -> tp.Tuple[otp.ListListNumpy, otp.ListNumpy, otp.ListNumpy]:
+        image_buffer = flow.tensor_list_to_tensor_buffer(image_list)
+        (res_image, scale, new_size) = flow.image.resize(
+            image_buffer,
+            target_size=target_size,
+            min_size=min_size,
+            max_size=max_size,
+            keep_aspect_ratio=True,
+            resize_side=resize_side,
+            interpolation_type=interpolation_type,
+        )
+        out_shape = image_test_util.infer_keep_aspect_ratio_resized_images_static_shape(
+            target_size=target_size,
+            min_size=min_size,
+            max_size=max_size,
+            aspect_ratio_list=aspect_ratio_list,
+            resize_side=resize_side,
+            channels=channels,
+        )
+        if print_debug_info:
+            print("resized image_static_shape: {}".format(out_shape))
+        res_image = flow.tensor_buffer_to_tensor_list(
+            res_image, shape=out_shape, dtype=dtype
+        )
+        return (res_image, scale, new_size)
+
+    return image_resize_keep_aspect_ratio
+
+
+def _of_image_resize(
+    image_list,
+    dtype=flow.float32,
+    origin_dtype=None,
+    channels=3,
+    keep_aspect_ratio=False,
+    target_size=None,
+    min_size=None,
+    max_size=None,
+    resize_side="shorter",
+    interpolation_type="bilinear",
+    print_debug_info=False,
+):
+    assert isinstance(image_list, (list, tuple))
+    assert all((isinstance(image, np.ndarray) for image in image_list))
+    assert all((image.ndim == 3 for image in image_list))
+    assert all((image.shape[2] == channels for image in image_list))
+    (image_static_shape, aspect_ratio_list) = image_test_util.infer_images_static_shape(
+        image_list, channels
+    )
+    if print_debug_info:
+        print("image_static_shape: {}".format(image_static_shape))
+        print("aspect_ratio_list: {}".format(aspect_ratio_list))
+    flow.clear_default_session()
+    func_cfg = flow.FunctionConfig()
+    func_cfg.default_logical_view(flow.scope.mirrored_view())
+    image_list = [np.expand_dims(image, axis=0) for image in image_list]
+    if keep_aspect_ratio:
+        image_resize_func = _make_image_resize_keep_aspect_ratio_func(
+            target_size=target_size,
+            min_size=min_size,
+            max_size=max_size,
+            image_static_shape=image_static_shape,
+            aspect_ratio_list=aspect_ratio_list,
+            dtype=dtype,
+            channels=channels,
+            resize_side=resize_side,
+            interpolation_type=interpolation_type,
+            func_cfg=func_cfg,
+            print_debug_info=print_debug_info,
+        )
+        (res_image, scale, new_size) = image_resize_func([image_list])
+        return (res_image[0], scale[0], new_size[0])
+    else:
+        if origin_dtype is None:
+            origin_dtype = dtype
+        image_resize_func = _make_image_resize_to_fixed_func(
+            target_size=target_size,
+            image_static_shape=image_static_shape,
+            dtype=dtype,
+            origin_dtype=origin_dtype,
+            channels=channels,
+            interpolation_type=interpolation_type,
+            func_cfg=func_cfg,
+            print_debug_info=print_debug_info,
+        )
+        (res_image, scale) = image_resize_func([image_list])
+        new_size = np.asarray([(target_size, target_size)] * len(image_list))
+        return (res_image[0], scale[0], new_size)
+
+
+def _get_resize_size_and_scale(
+    w,
+    h,
+    target_size,
+    min_size=None,
+    max_size=None,
+    keep_aspect_ratio=True,
+    resize_side="shorter",
+):
+    if keep_aspect_ratio:
+        assert isinstance(target_size, int)
+        aspect_ratio = float(min((w, h))) / float(max((w, h)))
+        (
+            min_res_size,
+            max_res_size,
+        ) = image_test_util.compute_keep_aspect_ratio_resized_size(
+            target_size, min_size, max_size, aspect_ratio, resize_side
+        )
+        if w < h:
+            res_w = min_res_size
+            res_h = max_res_size
+        else:
+            res_w = max_res_size
+            res_h = min_res_size
+    else:
+        assert isinstance(target_size, (list, tuple))
+        assert len(target_size) == 2
+        assert all((isinstance(size, int) for size in target_size))
+        (res_w, res_h) = target_size
+    scale_w = res_w / w
+    scale_h = res_h / h
+    return ((res_w, res_h), (scale_w, scale_h))
+
+
+def _cv_image_resize(
+    image_list,
+    target_size,
+    keep_aspect_ratio=True,
+    min_size=None,
+    max_size=None,
+    resize_side="shorter",
+    interpolation=cv2.INTER_LINEAR,
+    dtype=np.float32,
+):
+    res_image_list = []
+    res_size_list = []
+    res_scale_list = []
+    for image in image_list:
+        (h, w) = image.shape[:2]
+        (new_size, scale) = _get_resize_size_and_scale(
+            w, h, target_size, min_size, max_size, keep_aspect_ratio, resize_side
+        )
+        res_image_list.append(
+            cv2.resize(image.squeeze(), new_size, interpolation=interpolation).astype(
+                dtype
+            )
+        )
+        res_size_list.append(new_size)
+        res_scale_list.append(scale)
+    return (res_image_list, res_scale_list, res_size_list)
+
+
+def _test_image_resize_with_cv(
+    test_case,
+    image_files,
+    target_size,
+    min_size=None,
+    max_size=None,
+    keep_aspect_ratio=True,
+    resize_side="shorter",
+    dtype=flow.float32,
+    origin_dtype=None,
+    print_debug_info=False,
+):
+    if origin_dtype is None:
+        origin_dtype = dtype
+    image_list = image_test_util.read_images_by_cv(image_files, origin_dtype)
+    if print_debug_info:
+        print("origin images shapes: {}".format([image.shape for image in image_list]))
+        print(
+            "target_size: {}, min_size: {}, max_size: {}, keep_aspect_ratio: {}, \nresize_side: {}, dtype: {}, origin_dtype: {}".format(
+                target_size,
+                min_size,
+                max_size,
+                keep_aspect_ratio,
+                resize_side,
+                dtype,
+                origin_dtype,
+            )
+        )
+    (of_res_images, of_scales, of_new_sizes) = _of_image_resize(
+        image_list=image_list,
+        dtype=dtype,
+        origin_dtype=origin_dtype,
+        keep_aspect_ratio=keep_aspect_ratio,
+        target_size=target_size,
+        min_size=min_size,
+        max_size=max_size,
+        resize_side=resize_side,
+        print_debug_info=print_debug_info,
+    )
+    (cv_res_images, cv_scales, cv_new_sizes) = _cv_image_resize(
+        image_list=image_list,
+        target_size=target_size,
+        keep_aspect_ratio=keep_aspect_ratio,
+        min_size=min_size,
+        max_size=max_size,
+        resize_side=resize_side,
+        dtype=flow.convert_oneflow_dtype_to_numpy_dtype(dtype),
+    )
+    if print_debug_info:
+        print("comparing resized image between of and cv")
+        for (i, (of_image, cv_image)) in enumerate(zip(of_res_images, cv_res_images)):
+            print("    origin image shape: {}".format(image_list[i].shape))
+            print(
+                "    resized image shape: {} vs. {}".format(
+                    of_image.shape, cv_image.shape
+                )
+            )
+        print("comparing resized image scale between of and cv")
+        for (of_scale, cv_scale) in zip(of_scales, cv_scales):
+            print("    scale: {} vs. {}:".format(of_scale, cv_scale))
+        print("comparing resized image new size between of and cv")
+        for (of_new_size, cv_new_size) in zip(of_new_sizes, cv_new_sizes):
+            print("    new_size: {} vs. {}:".format(of_new_size, cv_new_size))
+    for (
+        of_res_image,
+        cv_res_image,
+        of_scale,
+        cv_scale,
+        of_new_size,
+        cv_new_size,
+    ) in zip(
+        of_res_images, cv_res_images, of_scales, cv_scales, of_new_sizes, cv_new_sizes
+    ):
+        test_case.assertTrue(np.allclose(of_res_image, cv_res_image))
+        test_case.assertTrue(np.allclose(of_scale, cv_scale))
+        test_case.assertTrue(np.allclose(of_new_size, cv_new_size))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestImageResize(flow.unittest.TestCase):
+    def test_image_resize_to_fixed_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case, image_files, target_size=(224, 224), keep_aspect_ratio=False
+        )
+
+    def test_image_resize_shorter_to_target_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=800,
+            keep_aspect_ratio=True,
+            resize_side="shorter",
+        )
+
+    def test_image_resize_longer_to_target_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=1000,
+            keep_aspect_ratio=True,
+            resize_side="longer",
+        )
+
+    def test_image_resize_shorter_to_target_size_with_max_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=800,
+            max_size=1333,
+            keep_aspect_ratio=True,
+            resize_side="shorter",
+        )
+
+    def test_image_resize_longer_to_target_size_with_min_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=1000,
+            min_size=600,
+            keep_aspect_ratio=True,
+            resize_side="longer",
+        )
+
+    def test_image_resize_to_fixed_size_with_dtype_uint8(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=(1000, 1000),
+            keep_aspect_ratio=False,
+            dtype=flow.uint8,
+        )
+
+    def test_image_resize_shorter_to_target_size_with_max_size_with_dtype_uint8(
+        test_case,
+    ):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=1000,
+            max_size=1600,
+            keep_aspect_ratio=True,
+            resize_side="shorter",
+            dtype=flow.uint8,
+        )
+
+    def test_image_resize_uint8_to_float(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=(1000, 1000),
+            keep_aspect_ratio=False,
+            dtype=flow.float32,
+            origin_dtype=flow.uint8,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_image_target_resize.py b/python/oneflow/compatible/single_client/test/ops/test_image_target_resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..6091e277168d6dd58f150ce15b92e116ae03230e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_image_target_resize.py
@@ -0,0 +1,152 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import typing as tp
+import unittest
+
+import cv2
+import image_test_util
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as otp
+
+
+def _of_image_target_resize(
+    images, target_size, max_size, image_static_shape, aspect_ratio_list
+):
+    assert image_static_shape[-1] == 3
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def image_target_resize_job(
+        image: otp.ListListNumpy.Placeholder(shape=image_static_shape, dtype=flow.float)
+    ) -> tp.Tuple[otp.ListListNumpy, otp.ListNumpy, otp.ListNumpy]:
+        image_buffer = flow.tensor_list_to_tensor_buffer(image)
+        (res_image_buffer, new_size, scale) = flow.image_target_resize(
+            image_buffer,
+            target_size=target_size,
+            max_size=max_size,
+            resize_side="shorter",
+        )
+        out_shape = image_test_util.infer_keep_aspect_ratio_resized_images_static_shape(
+            target_size=target_size,
+            min_size=None,
+            max_size=max_size,
+            aspect_ratio_list=aspect_ratio_list,
+            resize_side="shorter",
+            channels=3,
+        )
+        res_image = flow.tensor_buffer_to_tensor_list(
+            res_image_buffer, shape=out_shape, dtype=flow.float
+        )
+        return (res_image, new_size, scale)
+
+    (res_image, new_size, scale) = image_target_resize_job([images])
+    return (res_image[0], new_size[0], scale[0])
+
+
+def _target_resize_by_cv(images, target_size, max_size):
+    res_images = []
+    res_sizes = []
+    res_scales = []
+    for image in images:
+        (h, w) = image.shape[0:2]
+        (res_size, res_scale) = _get_target_resize_size(w, h, target_size, max_size)
+        res_images.append(cv2.resize(image, res_size))
+        res_sizes.append(res_size)
+        res_scales.append(res_scale)
+    return (res_images, res_sizes, res_scales)
+
+
+def _get_target_resize_size(w, h, target_size, max_size):
+    aspect_ratio = float(min((w, h))) / float(max((w, h)))
+    (
+        min_res_size,
+        max_res_size,
+    ) = image_test_util.compute_keep_aspect_ratio_resized_size(
+        target_size, None, max_size, aspect_ratio, "shorter"
+    )
+    if w < h:
+        res_w = min_res_size
+        res_h = max_res_size
+    else:
+        res_w = max_res_size
+        res_h = min_res_size
+    scale_w = res_w / w
+    scale_h = res_h / h
+    return ((res_w, res_h), (scale_w, scale_h))
+
+
+def _compare_image_target_resize_with_cv(
+    test_case, image_files, target_size, max_size, print_debug_info=False
+):
+    images = image_test_util.read_images_by_cv(image_files, flow.float)
+    (image_static_shape, aspect_ratio_list) = image_test_util.infer_images_static_shape(
+        images
+    )
+    expand_images = [np.expand_dims(image, axis=0) for image in images]
+    (resized_images, size, scale) = _of_image_target_resize(
+        expand_images, target_size, max_size, image_static_shape, aspect_ratio_list
+    )
+    (cv_resized_images, cv_resized_sizes, cv_resized_scales) = _target_resize_by_cv(
+        images, target_size, max_size
+    )
+    for (
+        resized_image,
+        cv_resized_image,
+        image_size,
+        image_scale,
+        resized_size,
+        resized_scale,
+    ) in zip(
+        resized_images,
+        cv_resized_images,
+        size,
+        scale,
+        cv_resized_sizes,
+        cv_resized_scales,
+    ):
+        if print_debug_info:
+            print("resized_image shape:", resized_image.shape)
+            print("cv_resized_image shape:", cv_resized_image.shape)
+            print("resized w & h:", image_size, resized_size)
+            print("resize w_scale & h_scale:", image_scale, resized_scale)
+        test_case.assertTrue(np.allclose(resized_image, cv_resized_image))
+        test_case.assertTrue(np.allclose(image_size, resized_size))
+        test_case.assertTrue(np.allclose(image_scale, resized_scale))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestImageTargetResize(flow.unittest.TestCase):
+    def test_image_target_resize(test_case):
+        _compare_image_target_resize_with_cv(
+            test_case,
+            [
+                "/dataset/mscoco_2017/val2017/000000000139.jpg",
+                "/dataset/mscoco_2017/val2017/000000000632.jpg",
+            ],
+            800,
+            1333,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_in_top_k.py b/python/oneflow/compatible/single_client/test/ops/test_in_top_k.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fa353fa6e1c578087ddeaab11a7e5c518d1a80d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_in_top_k.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type, target_dtype, predictions_shape, k, with_finite=False
+):
+    assert device_type in ["gpu", "cpu"]
+    assert target_dtype in ["int32", "int64"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_data_type(flow.float)
+    instance_num = predictions_shape[0]
+    classes = predictions_shape[1]
+    targets = np.random.randint(classes, size=instance_num).astype(
+        type_name_to_np_type[target_dtype]
+    )
+    predictions = np.random.rand(*predictions_shape).astype("float32")
+    if with_finite:
+        predictions[np.random.randint(instance_num)][
+            np.random.randint(classes)
+        ] = float("inf")
+
+    @flow.global_function(function_config=func_config)
+    def IntopkJob(
+        targets: tp.ListNumpy.Placeholder(
+            (instance_num + 10,), dtype=type_name_to_flow_type[target_dtype]
+        ),
+        predictions: tp.ListNumpy.Placeholder(
+            tuple([dim + 5 for dim in predictions_shape]), dtype=flow.float
+        ),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.in_top_k(targets, predictions, k=k)
+
+    of_out = IntopkJob([targets], [predictions]).get().numpy_list()[0]
+    tf_out = tf.math.in_top_k(targets, predictions, k=k)
+    assert np.array_equal(of_out, tf_out)
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["target_dtype"] = ["int32", "int64"]
+    arg_dict["predictions_shape"] = [(10, 5)]
+    arg_dict["k"] = [1, 2, 5]
+    arg_dict["with_finite"] = [False, True]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestInTopk(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_in_top_K(test_case):
+        for arg in gen_arg_list():
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_indexed_slices_reduce_sum.py b/python/oneflow/compatible/single_client/test/ops/test_indexed_slices_reduce_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3fc8b5b1c544a275269197a924fd5fd1f0d61e3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_indexed_slices_reduce_sum.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def _check(test_case, x_indices, x_values, y_indices, y_values, num_unique):
+    ref_indices = np.unique(x_indices)
+    np.sort(ref_indices)
+    num_unique = num_unique.item()
+    test_case.assertTrue(num_unique == ref_indices.shape[0])
+    key_to_idx = dict(zip(ref_indices, range(num_unique)))
+    ref_values = np.zeros((num_unique, y_values.shape[-1]), y_values.dtype)
+    for i in range(x_indices.shape[0]):
+        ref_values[key_to_idx[x_indices[i].item()]] += x_values[i]
+    y_indices = y_indices[0:num_unique]
+    y_values = y_values[0:num_unique]
+    sorted_idx = np.argsort(y_indices)
+    y_indices = y_indices[sorted_idx]
+    y_values = y_values[sorted_idx]
+    test_case.assertTrue(np.array_equal(ref_indices, y_indices))
+    test_case.assertTrue(np.allclose(ref_values, y_values))
+
+
+def _run_test(test_case, indices, values, indices_dtype, values_dtype, device):
+    @flow.global_function(function_config=func_config)
+    def TestJob(
+        indices: oft.Numpy.Placeholder(indices.shape, dtype=indices_dtype),
+        values: oft.Numpy.Placeholder(values.shape, dtype=values_dtype),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            return flow.experimental.indexed_slices_reduce_sum(indices, values)
+
+    (out_indices, out_values, num_unique) = TestJob(indices, values).get()
+    _check(
+        test_case,
+        indices,
+        values,
+        out_indices.numpy(),
+        out_values.numpy(),
+        num_unique.numpy(),
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestIndexedSlicesReduceSum(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_indexed_slices_reduce_sum_gpu(test_case):
+        indices = np.random.randint(0, 32, 1024).astype(np.int32)
+        values = np.random.rand(1024, 8).astype(np.float32)
+        _run_test(test_case, indices, values, flow.int32, flow.float32, "gpu")
+
+    def test_indexed_slices_reduce_sum_cpu(test_case):
+        indices = np.random.randint(0, 32, 1024).astype(np.int32)
+        values = np.random.rand(1024, 8).astype(np.float32)
+        _run_test(test_case, indices, values, flow.int32, flow.float32, "cpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_inplace.py b/python/oneflow/compatible/single_client/test/ops/test_inplace.py
new file mode 100644
index 0000000000000000000000000000000000000000..807986d0637585eea50ff40b95707eb99ef1d443
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_inplace.py
@@ -0,0 +1,118 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def MakeFuncConfig(enable_inplace):
+    func_config = flow.FunctionConfig()
+    func_config.enable_inplace(enable_inplace)
+    return func_config
+
+
+def TrainCompare(test_case, func):
+    func_config = MakeFuncConfig(True)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def EnableInplace():
+        return func("w0")
+
+    func_config.enable_inplace(False)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def DisableInplace():
+        return func("w1")
+
+    num_iter = 10
+    enable_inplace_losses = np.array(
+        [EnableInplace().get().tolist() for _ in range(num_iter)]
+    )
+    disable_inplace_losses = np.array(
+        [DisableInplace().get().tolist() for _ in range(num_iter)]
+    )
+    test_case.assertTrue(np.allclose(enable_inplace_losses, disable_inplace_losses))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestInplace(flow.unittest.TestCase):
+    def test_loss_inplace(test_case):
+        def IdentityLoss(name):
+            w = flow.get_variable(
+                name, (10,), initializer=flow.constant_initializer(100)
+            )
+            y = flow.math.reduce_sum(w)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [5]), momentum=0
+            ).minimize(y)
+            return y
+
+        TrainCompare(test_case, IdentityLoss)
+
+    def test_inplace_variable(test_case):
+        @flow.global_function(function_config=MakeFuncConfig(True))
+        def InplaceVariable():
+            w = flow.get_variable("w", (10,), initializer=flow.constant_initializer(1))
+            y = flow.math.relu(w)
+            return y
+
+        test_case.assertTrue(
+            np.allclose(InplaceVariable().get().numpy(), np.ones((10,), np.float32))
+        )
+
+    def test_deadlock(test_case):
+        @flow.global_function(function_config=MakeFuncConfig(True))
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.relu(x)
+            y = flow.math.relu(y)
+
+        Foo(np.ones((10,), dtype=np.float32))
+
+    def test_nodeadlock_with_return(test_case):
+        @flow.global_function(function_config=MakeFuncConfig(True))
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.relu(x)
+            y = flow.math.relu(y)
+            return y
+
+        Foo(np.ones((10,), dtype=np.float32)).get()
+
+    def test_reentrant_lock_check_failed(test_case):
+        @flow.global_function(function_config=MakeFuncConfig(True))
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.relu(x)
+            y = flow.math.relu(y)
+
+        Foo(np.ones((10,), dtype=np.float32))
+
+    def test_const_inplace_variable(test_case):
+        @flow.global_function(function_config=MakeFuncConfig(True))
+        def InplaceVariable():
+            w = flow.get_variable("w", (2, 5), initializer=flow.constant_initializer(1))
+            y = flow.reshape(w, (10,))
+            return y
+
+        of_ret = InplaceVariable().get().numpy()
+        test_case.assertTrue(np.allclose(of_ret, np.ones((10,), np.float32)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_instance_norm_op.py b/python/oneflow/compatible/single_client/test/ops/test_instance_norm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c7a59f3250936ab4f69adb4d8025afdaa71c8af
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_instance_norm_op.py
@@ -0,0 +1,139 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def getInstanceNorm1DOutAndGrad(input, gout, eps):
+    assert len(input.shape) == len(gout.shape)
+    assert len(input.shape) >= 3
+    input_reshape_to_1d = np.reshape(input, (input.shape[0], input.shape[1], -1))
+    gout_reshape_to_1d = np.reshape(gout, (gout.shape[0], gout.shape[1], -1))
+    gamma = np.ones((1, input_reshape_to_1d.shape[1], 1), dtype=np.float32)
+    mean_np = np.mean(input_reshape_to_1d, axis=2, keepdims=True)
+    in_sub_mean = input_reshape_to_1d - mean_np
+    var_np = np.mean(np.square(in_sub_mean), axis=2, keepdims=True)
+    invar_np = 1.0 / np.sqrt(var_np + eps)
+    out_np = in_sub_mean * invar_np * gamma
+    gvar = (
+        gout_reshape_to_1d * gamma * in_sub_mean * -0.5 * np.power(var_np + eps, -1.5)
+    )
+    gvar = np.sum(gvar, axis=2, keepdims=True)
+    gmean = np.sum(gout_reshape_to_1d * gamma, axis=2, keepdims=True)
+    gmean *= -invar_np
+    scale = 1.0 / input_reshape_to_1d.shape[2]
+    tmp = scale * np.sum(-2.0 * in_sub_mean, axis=2, keepdims=True) * gvar
+    gmean += tmp
+    gin_np = (
+        gout_reshape_to_1d * gamma * invar_np
+        + gvar * scale * 2.0 * in_sub_mean
+        + gmean * scale
+    )
+    return (
+        np.reshape(out_np, list(input.shape)),
+        np.reshape(gin_np, list(input.shape)),
+    )
+
+
+def _compare_instance_norm_nd_with_np(
+    input_shape, device_type, machine_ids, device_counts, eps, affine
+):
+    assert device_type in ["cpu", "gpu"]
+    assert len(input_shape) >= 3 and len(input_shape) <= 5
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    input = np.random.random(size=input_shape).astype(np.float32)
+    gout = np.random.random(size=input_shape).astype(np.float32)
+    (out_np, gin_np) = getInstanceNorm1DOutAndGrad(input, gout, eps)
+
+    def assert_prediction_grad(gin_of: tp.Numpy):
+        assert np.allclose(gin_of, gin_np, atol=1e-05)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def instanceNormJob(
+        of_input: tp.Numpy.Placeholder(shape=input.shape),
+        multipler: tp.Numpy.Placeholder(shape=input.shape),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=of_input.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+                name="v",
+            )
+            x_var = of_input + v
+            flow.watch_diff(x_var, assert_prediction_grad)
+        if len(of_input.shape) == 3:
+            out = flow.nn.InstanceNorm1d(x_var, eps=eps, affine=affine)
+        elif len(of_input.shape) == 4:
+            out = flow.nn.InstanceNorm2d(x_var, eps=eps, affine=affine)
+        else:
+            out = flow.nn.InstanceNorm3d(x_var, eps=eps, affine=affine)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(out * multipler)
+        return out
+
+    of_out = instanceNormJob(input, gout)
+    assert np.allclose(of_out, out_np, atol=1e-05)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestInstanceNormND1n1d(flow.unittest.TestCase):
+    def test_instance_norm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(4, 2, 32), (4, 2, 32, 32, 32)]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["machine_ids"] = ["0:0"]
+        arg_dict["device_counts"] = [1]
+        arg_dict["eps"] = [0.001]
+        arg_dict["affine"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_instance_norm_nd_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestInstanceNormND1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_instance_norm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["input_shape"] = [(4, 2, 32), (4, 2, 32, 32)]
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["machine_ids"] = ["0:0-1"]
+        arg_dict["device_counts"] = [2]
+        arg_dict["eps"] = [0.001]
+        arg_dict["affine"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_instance_norm_nd_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_interface_op_read_and_write.py b/python/oneflow/compatible/single_client/test/ops/test_interface_op_read_and_write.py
new file mode 100644
index 0000000000000000000000000000000000000000..579173100fe340ed042e5c03ec09e33b40f17581
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_interface_op_read_and_write.py
@@ -0,0 +1,65 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestInterfaceOpReadAndWrite(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test(test_case):
+        flow.config.gpu_device_num(2)
+
+        @flow.global_function()
+        def add() -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0-1"):
+                x = flow.get_variable(
+                    name="x",
+                    shape=(2, 3),
+                    initializer=flow.random_uniform_initializer(),
+                )
+                y = flow.get_variable(
+                    name="y",
+                    shape=(2, 3),
+                    initializer=flow.random_uniform_initializer(),
+                )
+                return flow.math.add_n([x, y])
+
+        flow.train.CheckPoint().init()
+        if flow.eager_execution_enabled():
+            add()
+        x_value = np.random.random((2, 3)).astype(np.float32)
+        y_value = np.random.random((2, 3)).astype(np.float32)
+        flow.experimental.set_interface_blob_value("x", x_value)
+        flow.experimental.set_interface_blob_value("y", y_value)
+        test_case.assertTrue(
+            np.array_equal(x_value, flow.experimental.get_interface_blob_value("x"))
+        )
+        test_case.assertTrue(
+            np.array_equal(y_value, flow.experimental.get_interface_blob_value("y"))
+        )
+        test_case.assertTrue(np.array_equal(add(), x_value + y_value))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_l1loss.py b/python/oneflow/compatible/single_client/test/ops/test_l1loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..a487e4cfd29a1b8930e5db611c9d6693737bdc77
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_l1loss.py
@@ -0,0 +1,158 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_l1loss_with_np(
+    input_shape, target_shape, device_type, machine_ids, device_counts
+):
+    input = np.random.random(size=input_shape).astype(np.float32)
+    target = np.random.random(size=target_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    func_config = flow.FunctionConfig()
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def np_l1loss(np_input, np_target):
+        np_l1 = np.abs(np_target - np_input)
+        np_l1_mean = np.mean(np_l1)
+        np_l1_sum = np.sum(np_l1)
+        np_l1_dict = {
+            "np_l1_loss": np_l1,
+            "np_l1_loss_mean": np_l1_mean,
+            "np_l1_loss_sum": np_l1_sum,
+        }
+        return np_l1_dict
+
+    def np_l1_loss_diff(np_input, np_target):
+        original_shape = np_target.shape
+        elemcnt = np_target.size
+        prediction = np_input.reshape(-1)
+        label = np_target.reshape(-1)
+        prediction_grad = np.zeros(elemcnt).astype(prediction.dtype)
+        for i in np.arange(elemcnt):
+            diff = prediction[i] - label[i]
+            prediction_grad[i] = np.sign(diff)
+        grad_mean = prediction_grad.reshape(original_shape) / elemcnt
+        grad_dict = {"np_grad_mean": grad_mean}
+        return grad_dict
+
+    np_out_l1loss_dict = np_l1loss(input, target)
+    np_grad_dict = np_l1_loss_diff(input, target)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad_dict["np_grad_mean"])
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_l1loss(
+        of_input: tp.Numpy.Placeholder(shape=input.shape),
+        of_target: tp.Numpy.Placeholder(shape=target.shape),
+    ) -> Dict[str, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=target.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+                name="v",
+            )
+            x_var = of_input + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        l1loss = flow.nn.L1Loss(x_var, of_target, reduction="none", name="of_l1loss")
+        l1loss_mean = flow.nn.L1Loss(
+            x_var, of_target, reduction="mean", name="of_l1loss_mean"
+        )
+        l1loss_sum = flow.nn.L1Loss(
+            x_var, of_target, reduction="sum", name="of_l1loss_sum"
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(l1loss_mean)
+        return {
+            "of_l1_loss": l1loss,
+            "of_l1_loss_mean": l1loss_mean,
+            "of_l1_loss_sum": l1loss_sum,
+        }
+
+    of_out_l1loss_dict = oneflow_l1loss(input, target)
+    assert np.allclose(
+        of_out_l1loss_dict["of_l1_loss"], np_out_l1loss_dict["np_l1_loss"]
+    )
+    assert np.allclose(
+        of_out_l1loss_dict["of_l1_loss_mean"][0], np_out_l1loss_dict["np_l1_loss_mean"]
+    )
+    assert np.allclose(
+        of_out_l1loss_dict["of_l1_loss_sum"][0], np_out_l1loss_dict["np_l1_loss_sum"]
+    )
+
+
+def _gen_arg_dict(shape, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["target_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testl1loss1n1d(flow.unittest.TestCase):
+    def test_l1loss_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(16, 3), device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_l1loss_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_l1loss_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32), device_type="gpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_l1loss_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testl1loss1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_l1loss_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 32, 16), device_type="gpu", machine_ids="0:0-1", device_counts=2
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_l1loss_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_l2_normalize.py b/python/oneflow/compatible/single_client/test/ops/test_l2_normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..3abc454e85059072fa3f652272302c196d5fc0e9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_l2_normalize.py
@@ -0,0 +1,86 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape, data_type, axis, epsilon):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def L2NormalizeJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = flow.math.l2_normalize(x, axis=axis, epsilon=epsilon)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = L2NormalizeJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.math.l2_normalize(x, axis=axis, epsilon=epsilon)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestL2Normalize(flow.unittest.TestCase):
+    def test_l2_normalize(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 10, 20, 30)]
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["axis"] = [-1, 0, 1, 2, 3]
+        arg_dict["epsilon"] = [1e-10, 1e-05]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_lamb.py b/python/oneflow/compatible/single_client/test/ops/test_lamb.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4f578bd1d2819571c8b3b66606b489682be1d19
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_lamb.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import tensorflow_addons as tfa
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow_addons_lamb(
+    test_case,
+    device_type,
+    x_shape,
+    beta1,
+    beta2,
+    epsilon,
+    weight_decay,
+    learning_rate,
+    train_iters,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=flow.FunctionConfig())
+    def testLAMB(
+        random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name="x",
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x + random_mask)
+            flow.optimizer.LAMB(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                weight_decay=weight_decay,
+            ).minimize(loss)
+            return x
+
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    x_list = []
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testLAMB(random_masks_seq[i])
+        x_list.append(x)
+        if i == 0:
+            init_value = np.copy(x)
+    var = tf.Variable(init_value)
+    opt = tfa.optimizers.LAMB(
+        learning_rate=learning_rate,
+        beta_1=beta1,
+        beta_2=beta2,
+        epsilon=epsilon,
+        weight_decay_rate=weight_decay,
+    )
+    var_list = []
+    for i in range(train_iters):
+        with tf.GradientTape() as tape:
+            if i == 0:
+                var0 = tf.identity(var)
+                var_list.append(var0)
+            random_mask = tf.Variable(random_masks_seq[i])
+            loss = tf.reduce_mean(var + random_mask)
+        gradients = tape.gradient(loss, var)
+        opt.apply_gradients(zip([gradients], [var]))
+        var_list.append(var.numpy())
+    case = (
+        device_type,
+        x_shape,
+        beta1,
+        beta2,
+        epsilon,
+        weight_decay,
+        learning_rate,
+        train_iters,
+    )
+    test_case.assertTrue(len(x_list) == len(var_list))
+    for (i, o, t) in zip(range(len(var_list)), x_list, var_list):
+        diff = o - t
+        test_case.assertTrue(
+            np.allclose(x_list[i], var_list[i], rtol=0.001, atol=0.001), (i, case, diff)
+        )
+    diff = x.flatten() - var.numpy().flatten()
+    test_case.assertTrue(
+        np.allclose(x.flatten(), var.numpy().flatten(), rtol=0.001, atol=0.001),
+        (case, diff),
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLamb(flow.unittest.TestCase):
+    def test_lamb(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["beta1"] = [0.9]
+        arg_dict["beta2"] = [0.999]
+        arg_dict["epsilon"] = [1e-06]
+        arg_dict["weight_decay"] = [0.01]
+        arg_dict["learning_rate"] = [0.0001]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow_addons_lamb(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_layer_norm.py b/python/oneflow/compatible/single_client/test/ops/test_layer_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..57454150e7dbf4d7befc759a690e3407c8cc29ed
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_layer_norm.py
@@ -0,0 +1,256 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import collections
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLayerNorm(flow.unittest.TestCase):
+    def test_layer_norm(_):
+        confs = [{"x_shape": (40, 64), "begin_norm_axis": -1, "begin_params_axis": -1}]
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["confs"] = confs
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["trainable"] = [True, False]
+        arg_dict["center"] = [True, False]
+        arg_dict["scale"] = [True, False]
+        arg_dict["epsilon"] = [1e-05, 1e-10]
+        arg_dict["fuse_add_to_output"] = [True, False]
+        for case in GenArgList(arg_dict):
+            (
+                device_type,
+                confs,
+                data_type,
+                trainable,
+                center,
+                scale,
+                epsilon,
+                fuse_add_to_output,
+            ) = case
+            if device_type == "cpu" and data_type == "float16":
+                continue
+            if device_type == "cpu" and fuse_add_to_output == True:
+                continue
+            x_shape = confs["x_shape"]
+            begin_norm_axis = confs["begin_norm_axis"]
+            begin_params_axis = confs["begin_params_axis"]
+            flow.clear_default_session()
+            assert (
+                begin_norm_axis == begin_params_axis
+            ), "tf doesn't support a dedicated begin_params_axis"
+            if data_type == "float16":
+                x = (
+                    np.random.uniform(low=-1, high=1, size=x_shape)
+                    .astype(np.float16)
+                    .astype(np.float32)
+                )
+            else:
+                x = np.random.uniform(low=-1, high=1, size=x_shape).astype(
+                    type_name_to_np_type[data_type]
+                )
+            dim = len(x.shape) - 2
+            with tf.GradientTape(persistent=True) as tape:
+                x_tf = tf.Variable(x)
+                if data_type == "float16":
+                    x_tf = tf.cast(x_tf, dtype=tf.float16)
+                    tf.keras.backend.set_floatx("float16")
+                layer = tf.keras.layers.LayerNormalization(
+                    axis=begin_norm_axis,
+                    epsilon=epsilon,
+                    center=center,
+                    scale=scale,
+                    beta_initializer="zeros",
+                    gamma_initializer="ones",
+                    beta_regularizer=None,
+                    gamma_regularizer=None,
+                    beta_constraint=None,
+                    gamma_constraint=None,
+                    trainable=trainable,
+                )
+                y_tf = layer(x_tf)
+                z_tf = y_tf + x_tf
+            if data_type == "float16":
+                dx_tf = tape.gradient(
+                    z_tf, x_tf, tf.constant(1.0, shape=z_tf.shape, dtype=tf.float16)
+                )
+            else:
+                dx_tf = tape.gradient(z_tf, x_tf, tf.constant(1.0, shape=z_tf.shape))
+            grad = tape.gradient(z_tf, layer.trainable_variables)
+            if trainable:
+                if scale and center:
+                    tf_gamma_diff = grad[0]
+                    tf_beta_diff = grad[1]
+                elif scale and (not center):
+                    tf_gamma_diff = grad[0]
+                elif not scale and center:
+                    tf_beta_diff = grad[0]
+                else:
+                    pass
+            else:
+                pass
+
+            def assert_grad(b):
+                diff = dx_tf.numpy() - b.numpy()
+                max_diff = np.max(np.abs(diff))
+                if data_type == "float16":
+                    tolerance = 0.003
+                else:
+                    tolerance = 1e-05
+                assert np.allclose(
+                    dx_tf.numpy(), b.numpy(), rtol=tolerance, atol=tolerance
+                ), (case, max_diff)
+
+            def assert_grad_gamma(b):
+                diff = tf_gamma_diff.numpy() - b.numpy()
+                max_diff = np.max(np.abs(diff))
+                assert np.allclose(
+                    tf_gamma_diff.numpy(), b.numpy(), rtol=0.0001, atol=0.0001
+                ), (case, max_diff)
+
+            def assert_grad_beta(b):
+                diff = tf_beta_diff.numpy() - b.numpy()
+                max_diff = np.max(np.abs(diff))
+                assert np.allclose(
+                    tf_beta_diff.numpy(), b.numpy(), rtol=1e-05, atol=1e-05
+                ), (case, max_diff)
+
+            if data_type == "float16":
+                dtype = flow.float
+            else:
+                dtype = type_name_to_flow_type[data_type]
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            func_config.enable_fuse_add_to_output(fuse_add_to_output)
+
+            @flow.global_function(type="train", function_config=func_config)
+            def test_job(x: oft.Numpy.Placeholder(x_shape, dtype=dtype)):
+                v = flow.get_variable(
+                    "x",
+                    shape=x_shape,
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(0),
+                    trainable=True,
+                )
+                flow.watch_diff(v, assert_grad)
+                x += v
+                if data_type == "float16":
+                    x = flow.cast(x, dtype=flow.float16)
+                with flow.scope.placement(device_type, "0:0"):
+                    param_shape = x.shape[begin_params_axis:]
+                    gamma = None
+                    beta = None
+                    if center:
+                        with flow.scope.namespace("LayerNorm"):
+                            beta = flow.get_variable(
+                                name="beta",
+                                shape=param_shape,
+                                dtype=flow.float,
+                                initializer=flow.constant_initializer(0.0),
+                                trainable=trainable,
+                                model_name="beta",
+                                reuse=False,
+                            )
+                            if trainable:
+                                flow.watch_diff(beta, assert_grad_beta)
+                            if data_type == "float16":
+                                beta = flow.cast(beta, dtype=flow.float16)
+                    if scale:
+                        with flow.scope.namespace("LayerNorm"):
+                            gamma = flow.get_variable(
+                                name="gamma",
+                                shape=param_shape,
+                                dtype=flow.float,
+                                initializer=flow.constant_initializer(1.0),
+                                trainable=trainable,
+                                model_name="gamma",
+                                reuse=False,
+                            )
+                            if trainable:
+                                if data_type == "float16":
+                                    flow.watch_diff(
+                                        gamma, test_global_storage.Setter("gamma_diff")
+                                    )
+                                else:
+                                    flow.watch_diff(gamma, assert_grad_gamma)
+                            if data_type == "float16":
+                                gamma = flow.cast(gamma, dtype=flow.float16)
+                    x = flow.identity(x)
+                    y = flow.nn.layer_norm(
+                        x,
+                        gamma=gamma,
+                        beta=beta,
+                        begin_norm_axis=begin_norm_axis,
+                        begin_params_axis=begin_params_axis,
+                        epsilon=epsilon,
+                    )
+                    z = y + x
+                if data_type == "float16":
+                    y = flow.cast(y, dtype=flow.float)
+                    z = flow.cast(z, dtype=flow.float)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+                ).minimize(z)
+                return y
+
+            y = test_job(x).get()
+            assert y.numpy().shape == y_tf.numpy().shape, (
+                y.numpy().shape,
+                y_tf.numpy().shape,
+            )
+            diff = y.numpy() - y_tf.numpy()
+            max_diff = np.max(np.abs(diff))
+            assert np.allclose(y.numpy(), y_tf.numpy(), rtol=1e-05, atol=0.002), (
+                case,
+                max_diff,
+            )
+            if data_type == "float16" and trainable and scale:
+                np_dy = np.ones(x.shape).astype(np.float32)
+                np_gamma_diff = np.sum(
+                    np_dy * y.numpy().astype(np.float32), axis=0
+                ).astype(np.float16)
+                max_diff = np.max(
+                    np.abs(
+                        np_gamma_diff
+                        - test_global_storage.Get("gamma_diff").astype(np.float16)
+                    )
+                )
+                assert np.allclose(
+                    np_gamma_diff,
+                    test_global_storage.Get("gamma_diff").astype(np.float16),
+                    rtol=0.05,
+                    atol=0.05,
+                ), (case, max_diff)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_layers_conv1d.py b/python/oneflow/compatible/single_client/test/ops/test_layers_conv1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c946885c322c4fe11392709b153410b9d917d2ac
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_layers_conv1d.py
@@ -0,0 +1,175 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def grouped_convolution1D(
+    inputs, filters, padding, num_groups, strides=None, dilation_rate=None
+):
+    input_list = tf.split(inputs, num_groups, axis=-1)
+    filter_list = tf.split(filters, num_groups, axis=-1)
+    output_list = []
+    for (conv_idx, (input_tensor, filter_tensor)) in enumerate(
+        zip(input_list, filter_list)
+    ):
+        output_list.append(
+            tf.nn.conv1d(
+                input_tensor,
+                filter_tensor,
+                padding="VALID",
+                stride=[1, 1, 1],
+                data_format="NWC",
+            )
+        )
+    outputs = tf.concat(output_list, axis=-1)
+    return outputs
+
+
+def compare_with_tensorflow(
+    test_case, device_type, x_shape, filters, kernel_size, groups
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.layers.conv1d(
+                x,
+                filters,
+                kernel_size=kernel_size,
+                strides=[1],
+                padding="valid",
+                data_format="NCW",
+                dilation_rate=1,
+                groups=groups,
+                use_bias=False,
+                kernel_initializer=flow.random_uniform_initializer(
+                    minval=0, maxval=100
+                ),
+                weight_name="conv1d_weight",
+            )
+            weight_shape = (filters, x.shape[1] // groups, kernel_size)
+            weight = flow.get_variable(
+                name="conv1d_weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(0, 2, 1))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        if groups == 1:
+            weight = tf.Variable(test_global_storage.Get("weight").transpose(2, 1, 0))
+            tf_out = tf.nn.conv1d(
+                x, weight, stride=[1, 1, 1], padding="VALID", data_format="NWC"
+            )
+        else:
+            weight = tf.Variable(test_global_storage.Get("weight").transpose(2, 1, 0))
+            tf_out = grouped_convolution1D(
+                x, weight, padding="VALID", num_groups=groups
+            )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 1)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    of_out_np = of_out.numpy().transpose(0, 2, 1)
+    tf_out_np = tf_out.numpy()
+    max_abs_diff = np.max(np.absolute(of_out_np - tf_out_np))
+    fail_info = "\nshape (of vs. tf): {} vs. {}\nmax_abs_diff: {}".format(
+        of_out_np.shape, tf_out_np.shape, max_abs_diff
+    )
+    test_case.assertTrue(
+        np.allclose(of_out_np, tf_out_np, rtol=1e-05, atol=1e-05), fail_info
+    )
+    of_x_diff_arr = test_global_storage.Get("x_diff").transpose(0, 2, 1)
+    tf_x_diff_arr = tf_x_diff.numpy()
+    max_abs_diff = np.max(np.abs(of_x_diff_arr - tf_x_diff_arr))
+    test_case.assertTrue(
+        np.allclose(of_x_diff_arr, tf_x_diff_arr, rtol=1e-05, atol=0.0001)
+    )
+    test_case.assertTrue(
+        np.allclose(
+            test_global_storage.Get("weight_diff").transpose(2, 1, 0),
+            tf_weight_diff.numpy(),
+            rtol=1e-05,
+            atol=1e-05,
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLayersConv1d(flow.unittest.TestCase):
+    def test_conv1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+    def test_conv2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 32, 20)]
+        arg_dict["filters"] = [32]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_layers_conv2d.py b/python/oneflow/compatible/single_client/test/ops/test_layers_conv2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..844523bd2747a8c375f69fce5dbfa44a92783f05
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_layers_conv2d.py
@@ -0,0 +1,219 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def grouped_convolution2D(
+    inputs, filters, padding, num_groups, strides=None, dilation_rate=None
+):
+    input_list = tf.split(inputs, num_groups, axis=-1)
+    filter_list = tf.split(filters, num_groups, axis=-1)
+    output_list = []
+    for (conv_idx, (input_tensor, filter_tensor)) in enumerate(
+        zip(input_list, filter_list)
+    ):
+        output_list.append(
+            tf.nn.conv2d(
+                input_tensor,
+                filter_tensor,
+                padding="VALID",
+                strides=[1, 1, 1, 1],
+                data_format="NHWC",
+            )
+        )
+    outputs = tf.concat(output_list, axis=-1)
+    return outputs
+
+
+def compare_with_tensorflow(
+    test_case, device_type, x_shape, filters, kernel_size, groups
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.layers.conv2d(
+                x,
+                filters,
+                kernel_size=kernel_size,
+                strides=[1, 1],
+                padding="valid",
+                data_format="NCHW",
+                dilation_rate=1,
+                groups=groups,
+                use_bias=False,
+                kernel_initializer=flow.random_uniform_initializer(
+                    minval=0, maxval=100
+                ),
+                weight_name="conv2d_weight",
+            )
+            weight_shape = (filters, x.shape[1] // groups, kernel_size, kernel_size)
+            weight = flow.get_variable(
+                name="conv2d_weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(0, 2, 3, 1))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        if groups == 1:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(2, 3, 1, 0)
+            )
+            tf_out = tf.nn.conv2d(
+                x, weight, strides=[1, 1, 1, 1], padding="VALID", data_format="NHWC"
+            )
+        else:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(2, 3, 1, 0)
+            )
+            tf_out = grouped_convolution2D(
+                x, weight, padding="VALID", num_groups=groups
+            )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 3, 1)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    of_out_np = of_out.numpy().transpose(0, 2, 3, 1)
+    tf_out_np = tf_out.numpy()
+    max_abs_diff = np.max(np.absolute(of_out_np - tf_out_np))
+    fail_info = "\nshape (of vs. tf): {} vs. {}\nmax_abs_diff: {}".format(
+        of_out_np.shape, tf_out_np.shape, max_abs_diff
+    )
+    test_case.assertTrue(
+        np.allclose(of_out_np, tf_out_np, rtol=1e-05, atol=1e-05), fail_info
+    )
+    of_x_diff_arr = test_global_storage.Get("x_diff").transpose(0, 2, 3, 1)
+    tf_x_diff_arr = tf_x_diff.numpy()
+    max_abs_diff = np.max(np.abs(of_x_diff_arr - tf_x_diff_arr))
+    test_case.assertTrue(
+        np.allclose(of_x_diff_arr, tf_x_diff_arr, rtol=1e-05, atol=0.0001)
+    )
+    test_case.assertTrue(
+        np.allclose(
+            test_global_storage.Get("weight_diff").transpose(2, 3, 1, 0),
+            tf_weight_diff.numpy(),
+            rtol=1e-05,
+            atol=1e-05,
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLayersConv2d(flow.unittest.TestCase):
+    def test_conv1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+    def test_conv2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+    def test_conv3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [8]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+    def test_conv4(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+    def test_conv5(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [8]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+    def test_conv6(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_layers_conv3d.py b/python/oneflow/compatible/single_client/test/ops/test_layers_conv3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b679b6dc1c596f40e89c6ebfa5404f6d3a15b914
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_layers_conv3d.py
@@ -0,0 +1,185 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def grouped_convolution2D(
+    inputs, filters, padding, num_groups, strides=None, dilation_rate=None
+):
+    input_list = tf.split(inputs, num_groups, axis=-1)
+    filter_list = tf.split(filters, num_groups, axis=-1)
+    output_list = []
+    for (conv_idx, (input_tensor, filter_tensor)) in enumerate(
+        zip(input_list, filter_list)
+    ):
+        output_list.append(
+            tf.nn.conv3d(
+                input_tensor,
+                filter_tensor,
+                padding="VALID",
+                strides=[1, 1, 1, 1, 1],
+                data_format="NDHWC",
+            )
+        )
+    outputs = tf.concat(output_list, axis=-1)
+    return outputs
+
+
+def compare_with_tensorflow(
+    test_case, device_type, x_shape, filters, kernel_size, groups
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.layers.conv3d(
+                x,
+                filters,
+                kernel_size=kernel_size,
+                strides=1,
+                padding="valid",
+                data_format="NCDHW",
+                dilation_rate=1,
+                groups=groups,
+                use_bias=False,
+                kernel_initializer=flow.random_uniform_initializer(
+                    minval=0, maxval=100
+                ),
+                weight_name="conv3d_weight",
+            )
+            weight_shape = (
+                filters,
+                x.shape[1] // groups,
+                kernel_size,
+                kernel_size,
+                kernel_size,
+            )
+            weight = flow.get_variable(
+                name="conv3d_weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(0, 2, 3, 4, 1))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        if groups == 1:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(2, 3, 4, 1, 0)
+            )
+            tf_out = tf.nn.conv3d(
+                x, weight, strides=[1, 1, 1, 1, 1], padding="VALID", data_format="NDHWC"
+            )
+        else:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(2, 3, 4, 1, 0)
+            )
+            tf_out = grouped_convolution2D(
+                x, weight, padding="VALID", num_groups=groups
+            )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 3, 4, 1)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    of_out_np = of_out.numpy().transpose(0, 2, 3, 4, 1)
+    tf_out_np = tf_out.numpy()
+    max_abs_diff = np.max(np.absolute(of_out_np - tf_out_np))
+    fail_info = "\nshape (of vs. tf): {} vs. {}\nmax_abs_diff: {}".format(
+        of_out_np.shape, tf_out_np.shape, max_abs_diff
+    )
+    test_case.assertTrue(
+        np.allclose(of_out_np, tf_out_np, rtol=1e-05, atol=1e-05), fail_info
+    )
+    of_x_diff_arr = test_global_storage.Get("x_diff").transpose(0, 2, 3, 4, 1)
+    tf_x_diff_arr = tf_x_diff.numpy()
+    max_abs_diff = np.max(np.abs(of_x_diff_arr - tf_x_diff_arr))
+    test_case.assertTrue(
+        np.allclose(of_x_diff_arr, tf_x_diff_arr, rtol=1e-05, atol=0.0001)
+    )
+    test_case.assertTrue(
+        np.allclose(
+            test_global_storage.Get("weight_diff").transpose(2, 3, 4, 1, 0),
+            tf_weight_diff.numpy(),
+            rtol=1e-05,
+            atol=1e-05,
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLayersConv3d(flow.unittest.TestCase):
+    def test_conv1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+    def test_conv2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10, 20)]
+        arg_dict["filters"] = [32]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_leaky_relu.py b/python/oneflow/compatible/single_client/test/ops/test_leaky_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..85201fff66cc4ce19bdc2e51618cdbe83c306fc9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_leaky_relu.py
@@ -0,0 +1,86 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape, data_type, alpha):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def LeakyReluJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = flow.nn.leaky_relu(x, alpha=alpha)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = LeakyReluJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.leaky_relu(x, alpha)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLeakyRelu(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_leaky_relu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 10, 20, 30), (10, 20)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["alpha"] = [0.1, -0.2, 2]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_logical_slice.py b/python/oneflow/compatible/single_client/test/ops/test_logical_slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..de4eea5f7550f638a122046b82677defa35c381c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_logical_slice.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_logical_slice(
+    test_case, var_shape, slice_tuples, split_axis, device_tag, flow_dtype, device_num
+):
+    flow.clear_default_session()
+    if device_tag == "gpu":
+        flow.config.gpu_device_num(device_num)
+
+    @flow.global_function()
+    def slice_fn():
+        with flow.scope.placement(device_tag, "0:0-{}".format(device_num - 1)):
+            var = flow.get_variable(
+                name="var",
+                shape=var_shape,
+                dtype=flow_dtype,
+                initializer=flow.random_uniform_initializer(-10, 10, dtype=flow_dtype),
+                distribute=flow.distribute.split(split_axis),
+            )
+            flow.watch(var, test_global_storage.Setter("var"))
+            ret = flow.experimental.logical_slice(var, slice_tuples)
+            return ret
+
+    of_res = slice_fn().get().numpy()
+    var_np = test_global_storage.Get("var")
+    slice_objs = []
+    for s in slice_tuples:
+        slice_objs.append(slice(s[0], s[1], s[2]))
+    test_case.assertTrue(np.array_equal(of_res, var_np[tuple(slice_objs)]))
+
+
+class TestLogicalSlice(flow.unittest.TestCase):
+    @flow.unittest.skip_unless_1n2d()
+    def test_logical_slice_4dim_2d(test_case):
+        var_shape = (30, 40, 20, 15)
+        slice_tuples = [(10, 20, 3), (1, 30, 4), (3, 16, 2), (5, 11, 1)]
+        arg_dict = OrderedDict()
+        arg_dict["split_axis"] = list(range(4))
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        arg_dict["flow_dtype"] = [flow.float, flow.int8]
+        arg_dict["device_num"] = [2]
+        for arg in GenArgDict(arg_dict):
+            _test_logical_slice(test_case, var_shape, slice_tuples, **arg)
+
+    @flow.unittest.skip_unless_1n4d()
+    def test_logical_slice_negative_start_stop_4dim_4d(test_case):
+        var_shape = (30, 40, 20, 15)
+        slice_tuples = [(10, None, 3), (1, -10, 4), (-15, -5, 2), (5, 11, 1)]
+        arg_dict = OrderedDict()
+        arg_dict["split_axis"] = list(range(4))
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        arg_dict["flow_dtype"] = [flow.float]
+        arg_dict["device_num"] = [4]
+        for arg in GenArgDict(arg_dict):
+            _test_logical_slice(test_case, var_shape, slice_tuples, **arg)
+
+    @flow.unittest.skip_unless_1n4d()
+    def test_logical_slice_2dim_3d(test_case):
+        var_shape = (30, 40)
+        slice_tuples = [(10, 20, 3), (1, 30, 4)]
+        arg_dict = OrderedDict()
+        arg_dict["split_axis"] = list(range(2))
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        arg_dict["flow_dtype"] = [flow.float]
+        arg_dict["device_num"] = [3]
+        for arg in GenArgDict(arg_dict):
+            _test_logical_slice(test_case, var_shape, slice_tuples, **arg)
diff --git a/python/oneflow/compatible/single_client/test/ops/test_logical_slice_assign.py b/python/oneflow/compatible/single_client/test/ops/test_logical_slice_assign.py
new file mode 100644
index 0000000000000000000000000000000000000000..449e15006a0b8696586bfa6d7460f35d705fd222
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_logical_slice_assign.py
@@ -0,0 +1,108 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_slice_assign(
+    test_case,
+    var_shape,
+    slice_tuples,
+    split_axis,
+    dst_device_tag,
+    flow_dtype,
+    device_num,
+):
+    flow.clear_default_session()
+    value_shape = [(s[1] - s[0] - 1) // s[2] + 1 for s in slice_tuples]
+    flow_to_np_dtype_dict = {flow.int8: np.int8, flow.float: np.single}
+    np_dtype = flow_to_np_dtype_dict[flow_dtype]
+    value = np.random.uniform(low=-10, high=10, size=value_shape).astype(np_dtype)
+    if dst_device_tag == "gpu":
+        flow.config.gpu_device_num(device_num)
+
+    def get_var():
+        return flow.get_variable(
+            name="var",
+            shape=var_shape,
+            dtype=flow_dtype,
+            initializer=flow.constant_initializer(0, dtype=flow_dtype),
+            distribute=flow.distribute.split(split_axis),
+        )
+
+    @flow.global_function()
+    def assign_fn(value_def: oft.Numpy.Placeholder(value.shape, dtype=flow_dtype)):
+        with flow.scope.placement(dst_device_tag, "0:0-{}".format(device_num - 1)):
+            var = get_var()
+            flow.experimental.logical_slice_assign(var, value_def, slice_tuples)
+
+    @flow.global_function()
+    def identity_fn():
+        with flow.scope.placement(dst_device_tag, "0:0-{}".format(device_num - 1)):
+            var = get_var()
+            return flow.identity(var)
+
+    assign_fn(value)
+    of_res = identity_fn().get().numpy()
+    np_res = np.zeros(var_shape).astype(np_dtype)
+    slice_objs = []
+    for s in slice_tuples:
+        slice_objs.append(slice(s[0], s[1], s[2]))
+    np_res[tuple(slice_objs)] = value
+    test_case.assertTrue(np.array_equal(of_res, np_res))
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestSliceAssign(flow.unittest.TestCase):
+    def test_slice_assign_4dim_4d(test_case):
+        var_shape = (30, 40, 20, 15)
+        slice_tuples = [(10, 20, 3), (1, 30, 4), (3, 16, 2), (5, 11, 1)]
+        arg_dict = OrderedDict()
+        arg_dict["split_axis"] = list(range(4))
+        arg_dict["dst_device_tag"] = ["cpu", "gpu"]
+        arg_dict["flow_dtype"] = [flow.float, flow.int8]
+        arg_dict["device_num"] = [4]
+        for arg in GenArgDict(arg_dict):
+            _test_slice_assign(test_case, var_shape, slice_tuples, **arg)
+
+    def test_slice_assign_negative_start_stop_4dim_4d(test_case):
+        var_shape = (30, 40, 20, 15)
+        slice_tuples = [(10, 20, 3), (-39, -10, 4), (-15, -5, 2), (5, 11, 1)]
+        arg_dict = OrderedDict()
+        arg_dict["split_axis"] = list(range(4))
+        arg_dict["dst_device_tag"] = ["cpu", "gpu"]
+        arg_dict["flow_dtype"] = [flow.float]
+        arg_dict["device_num"] = [4]
+        for arg in GenArgDict(arg_dict):
+            _test_slice_assign(test_case, var_shape, slice_tuples, **arg)
+
+    def test_slice_assign_2dim_3d(test_case):
+        var_shape = (30, 40)
+        slice_tuples = [(10, 20, 3), (1, 30, 4)]
+        arg_dict = OrderedDict()
+        arg_dict["split_axis"] = list(range(2))
+        arg_dict["dst_device_tag"] = ["cpu", "gpu"]
+        arg_dict["flow_dtype"] = [flow.float]
+        arg_dict["device_num"] = [3]
+        for arg in GenArgDict(arg_dict):
+            _test_slice_assign(test_case, var_shape, slice_tuples, **arg)
diff --git a/python/oneflow/compatible/single_client/test/ops/test_logsoftmax.py b/python/oneflow/compatible/single_client/test/ops/test_logsoftmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..f56e1fc974737b0732dac090808b90f33e17a1a7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_logsoftmax.py
@@ -0,0 +1,137 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_logsoftmax_with_np(
+    input_shape, axis, device_type, machine_ids, device_counts
+):
+    input_1 = np.random.random(size=input_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_logsoftmax(input, axis):
+        exps = np.exp(input)
+        softmax = exps / np.sum(exps, axis=axis, keepdims=True)
+        return np.log(softmax)
+
+    np_out_logsoftmax = np_logsoftmax(input_1, axis)
+
+    def np_diff(x, axis):
+        _grad = np.ones_like(x)
+        _sum = np.sum(_grad, axis=axis, keepdims=True)
+        _diff = _grad - np.exp(x) * _sum
+        return _diff
+
+    _np_grad = np_diff(np_out_logsoftmax, axis)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad, atol=1e-05)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_logsoftmax(
+        of_input_1: tp.Numpy.Placeholder(shape=input_1.shape),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input_1.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input_1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        of_logsoftmax_out = flow.nn.logsoftmax(x_var, axis)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_logsoftmax_out)
+        return of_logsoftmax_out
+
+    of_logsoftmax_out = oneflow_logsoftmax(input_1)
+    assert np.allclose(of_logsoftmax_out, np_out_logsoftmax, rtol=1e-05)
+
+
+def _gen_arg_dict(shape, axis, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["axis"] = [*axis]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testlogsoftmax1n1d(flow.unittest.TestCase):
+    def test_logsoftmax_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(2, 64, 32),
+            axis=(1, -1),
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_logsoftmax_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_logsoftmax_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(2, 4, 6, 2),
+            axis=(2, -2),
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_logsoftmax_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testlogsoftmax1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_logsoftmax_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(2, 8),
+            axis=(-1, 1),
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_logsoftmax_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_masked_fill.py b/python/oneflow/compatible/single_client/test/ops/test_masked_fill.py
new file mode 100644
index 0000000000000000000000000000000000000000..166a2b94b993e1a5b974902ae21eb124da6013e2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_masked_fill.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    GenArgDict,
+    test_global_storage,
+    type_name_to_flow_type,
+    type_name_to_np_type,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _masked_fill_np_fw_bw(x, mask, y_diff, type_name, value=0):
+    brocadcast_shape = np.broadcast(x, mask).shape
+    brocadcasted_x = np.broadcast_to(x, brocadcast_shape).astype(type_name)
+    brocadcasted_mask = np.broadcast_to(mask, brocadcast_shape)
+    masked_x = np.ma.array(brocadcasted_x, mask=brocadcasted_mask, fill_value=value)
+    y = masked_x.filled()
+    zero_like = np.zeros_like(y_diff)
+    filted_y_diff = np.where(brocadcasted_mask, zero_like, y_diff)
+    extended_axes_num = len(y_diff.shape) - len(x.shape)
+    extended_axes = tuple(range(extended_axes_num))
+    mid_diff = np.add.reduce(filted_y_diff, axis=extended_axes)
+    diff_axes = list()
+    for i in range(len(x.shape)):
+        if x.shape[i] != y_diff.shape[i + extended_axes_num]:
+            assert x.shape[i] == 1 and y_diff.shape[i + extended_axes_num] != 1
+            diff_axes.append(i)
+    if len(diff_axes) != 0:
+        x_diff = np.add.reduce(mid_diff, axis=tuple(diff_axes), keepdims=True)
+    else:
+        x_diff = mid_diff
+    return (y, x_diff)
+
+
+def _test_masked_fill_fw_bw(test_case, device, x_shape, mask_shape, type_name, value=0):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    if type_name == "float16":
+        flow_type = flow.float
+        np_type = np.float32
+    else:
+        flow_type = type_name_to_flow_type[type_name]
+        np_type = type_name_to_np_type[type_name]
+    func_config.default_data_type(flow_type)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_masked_fill_fw_bw_job(
+        x: oft.Numpy.Placeholder(x_shape, dtype=flow_type),
+        mask: oft.Numpy.Placeholder(mask_shape, dtype=flow_type),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            y = flow.get_variable(
+                name="vx",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x += flow.cast(y, flow_type)
+            mask = flow.cast(mask, dtype=flow.int8)
+            if type_name == "float16":
+                out = flow.cast(
+                    flow.masked_fill(flow.cast(x, flow.float16), mask, value),
+                    flow.float,
+                )
+            else:
+                out = flow.masked_fill(x, mask, value)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(out)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(out, test_global_storage.Setter("out"))
+            flow.watch_diff(out, test_global_storage.Setter("out_diff"))
+            return out
+
+    x = np.random.randint(low=0, high=100, size=x_shape)
+    mask = np.random.randint(low=0, high=2, size=mask_shape)
+    test_masked_fill_fw_bw_job(x.astype(np_type), mask.astype(np_type)).get()
+    out_diff = test_global_storage.Get("out_diff")
+    (np_out, np_x_diff) = _masked_fill_np_fw_bw(x, mask, out_diff, np_type, value)
+    if type_name == "float16":
+        tolerance = 0.001
+    else:
+        tolerance = 1e-05
+    test_case.assertTrue(
+        np.allclose(
+            np_out, test_global_storage.Get("out"), rtol=tolerance, atol=tolerance
+        )
+    )
+    test_case.assertTrue(
+        np.allclose(
+            np_x_diff, test_global_storage.Get("x_diff"), rtol=tolerance, atol=tolerance
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMaskedFill(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_masked_fill_fw_bw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["type_name"] = [
+            "float32",
+            "float16",
+            "double",
+            "int8",
+            "int32",
+            "int64",
+        ]
+        arg_dict["device"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(2, 2, 4), (2, 1, 4), (2, 2, 3, 2, 4)]
+        arg_dict["mask_shape"] = [(2, 1, 2, 4)]
+        arg_dict["value"] = [2.5, -5.5]
+        for arg in GenArgDict(arg_dict):
+            if arg["device"] == "cpu" and arg["type_name"] == "float16":
+                continue
+            _test_masked_fill_fw_bw(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_matmul.py b/python/oneflow/compatible/single_client/test/ops/test_matmul.py
new file mode 100644
index 0000000000000000000000000000000000000000..90cf20f814784d38bdedaef4fdf101f15243d0c9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_matmul.py
@@ -0,0 +1,456 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import typing
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgDict, GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type,
+    a_shape,
+    b_shape,
+    transpose_a,
+    transpose_b,
+    data_type,
+    fuse_add_to_output,
+    enable_tf32,
+    alpha,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.enable_fuse_add_to_output(fuse_add_to_output)
+    flow.config.enable_tensor_float_32_compute(enable_tf32)
+    if data_type == "float16":
+        dtype = flow.float
+    else:
+        dtype = type_name_to_flow_type[data_type]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def MatmulJob():
+        with flow.scope.placement(device_type, "0:0"):
+            a = flow.get_variable(
+                "a",
+                shape=a_shape,
+                dtype=dtype,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=1),
+                trainable=True,
+            )
+            b = flow.get_variable(
+                "b",
+                shape=b_shape,
+                dtype=dtype,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=1),
+                trainable=True,
+            )
+            if data_type == "float16":
+                out = flow.matmul(
+                    flow.cast(a, dtype=flow.float16),
+                    flow.cast(b, dtype=flow.float16),
+                    transpose_a,
+                    transpose_b,
+                    alpha,
+                )
+                c = flow.get_variable(
+                    "c",
+                    shape=out.shape,
+                    dtype=dtype,
+                    initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
+                    trainable=True,
+                )
+                loss = flow.cast(
+                    out + flow.cast(c, dtype=flow.float16), dtype=flow.float
+                )
+            else:
+                out = flow.matmul(a, b, transpose_a, transpose_b, alpha)
+                c = flow.get_variable(
+                    "c",
+                    shape=out.shape,
+                    dtype=dtype,
+                    initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
+                    trainable=True,
+                )
+                loss = out + c
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(a, test_global_storage.Setter("a"))
+            flow.watch_diff(a, test_global_storage.Setter("a_diff"))
+            flow.watch(b, test_global_storage.Setter("b"))
+            flow.watch_diff(b, test_global_storage.Setter("b_diff"))
+            flow.watch(c, test_global_storage.Setter("c"))
+            flow.watch_diff(c, test_global_storage.Setter("c_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = MatmulJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        a = tf.Variable(test_global_storage.Get("a"))
+        b = tf.Variable(test_global_storage.Get("b"))
+        c = tf.Variable(test_global_storage.Get("c"))
+        if data_type == "float16":
+            a = tf.cast(a, tf.float16)
+            b = tf.cast(b, tf.float16)
+            c = tf.cast(c, tf.float16)
+        tf_out = tf.matmul(a, b, transpose_a, transpose_b)
+        tf_out = tf_out * alpha
+        tf_out = tf_out + c
+        if data_type == "float16":
+            tf_out = tf.cast(tf_out, tf.float32)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_a_diff = tape.gradient(tf_out, a, loss_diff)
+    tf_b_diff = tape.gradient(tf_out, b, loss_diff)
+    tf_c_diff = tape.gradient(tf_out, c, loss_diff)
+    if data_type == "float16":
+        tolerance = 0.002
+    else:
+        tolerance = 0.001
+    assert np.allclose(
+        of_out.numpy(), tf_out.numpy(), rtol=tolerance, atol=tolerance
+    ), np.max(np.abs(of_out.numpy() - tf_out.numpy()))
+    assert np.allclose(
+        test_global_storage.Get("a_diff"),
+        tf_a_diff.numpy(),
+        rtol=tolerance,
+        atol=tolerance,
+    )
+    assert np.allclose(
+        test_global_storage.Get("b_diff"),
+        tf_b_diff.numpy(),
+        rtol=tolerance,
+        atol=tolerance,
+    )
+    assert np.allclose(
+        test_global_storage.Get("c_diff"),
+        tf_c_diff.numpy(),
+        rtol=tolerance,
+        atol=tolerance,
+    )
+
+
+def filter_args(arg_list):
+    def trans_shape(shape):
+        tmp_shape = shape[:-2]
+        tmp_shape += (shape[-1], shape[-2])
+        return tmp_shape
+
+    ret = []
+    for arg in arg_list:
+        a_shape = arg[1]
+        b_shape = arg[2]
+        if arg[3]:
+            a_shape = trans_shape(a_shape)
+        if arg[4]:
+            b_shape = trans_shape(b_shape)
+        if a_shape[-1] == b_shape[-2]:
+            ret.append(tuple(arg))
+    return ret
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["gpu", "cpu"]
+    arg_dict["a_shape"] = [(512, 256), (256, 512)]
+    arg_dict["b_shape"] = [(256, 1024), (1024, 256)]
+    arg_dict["transpose_a"] = [True, False]
+    arg_dict["transpose_b"] = [True, False]
+    arg_dict["data_type"] = ["float16", "float32", "double"]
+    arg_dict["fuse_add_to_output"] = [True, False]
+    arg_dict["enable_tf32"] = [True, False]
+    arg_dict["alpha"] = [1.5, 1]
+    matmul_args = filter_args(GenArgList(arg_dict))
+    arg_dict.clear()
+    arg_dict["device_type"] = ["gpu", "cpu"]
+    arg_dict["a_shape"] = [(10, 10, 64, 32), (10, 10, 32, 64)]
+    arg_dict["b_shape"] = [(10, 10, 32, 128), (10, 10, 128, 32)]
+    arg_dict["transpose_a"] = [True, False]
+    arg_dict["transpose_b"] = [True, False]
+    arg_dict["data_type"] = ["float16", "float32", "double"]
+    arg_dict["fuse_add_to_output"] = [True, False]
+    arg_dict["enable_tf32"] = [True, False]
+    arg_dict["alpha"] = [2.0]
+    batch_matmul_args = filter_args(GenArgList(arg_dict))
+    return matmul_args + batch_matmul_args
+
+
+def filter_args_v2(arg_dict_list):
+    def trans_shape(shape):
+        tmp_shape = shape[:-2]
+        tmp_shape += (shape[-1], shape[-2])
+        return tmp_shape
+
+    ret = []
+    for arg_dict in arg_dict_list:
+        if arg_dict["transpose_a"]:
+            a_shape = trans_shape(arg_dict["a_shape"])
+        else:
+            a_shape = arg_dict["a_shape"]
+        if arg_dict["transpose_b"]:
+            b_shape = trans_shape(arg_dict["b_shape"])
+        else:
+            b_shape = arg_dict["b_shape"]
+        if a_shape[-1] != b_shape[-2]:
+            continue
+        if arg_dict["device_type"] == "cpu" and (
+            arg_dict["data_type"] == "float16" or arg_dict["enable_tf32"] is True
+        ):
+            continue
+        if arg_dict["data_type"] != "float32" and arg_dict["enable_tf32"] is True:
+            continue
+        if (
+            arg_dict["test_add_to_output"] is False
+            and arg_dict["fuse_add_to_output"] is True
+        ):
+            continue
+        arg_dict["atol"] = 1e-05
+        ret.append(arg_dict)
+    return ret
+
+
+def gen_args():
+    args = OrderedDict()
+    args["a_shape"] = [(10, 3, 4), (7, 6, 8)]
+    args["b_shape"] = [(4, 5), (10, 8)]
+    args["transpose_a"] = [False]
+    args["transpose_b"] = [True, False]
+    args["alpha"] = [1.5, 1]
+    args["data_type"] = ["float16", "float32", "double"]
+    args["device_type"] = ["gpu", "cpu"]
+    args["test_add_to_output"] = [True, False]
+    args["fuse_add_to_output"] = [True, False]
+    args["enable_tf32"] = [True, False]
+    return filter_args_v2(GenArgDict(args))
+
+
+def get_lr_scheduler():
+    return flow.optimizer.PiecewiseConstantScheduler([], [0.0001])
+
+
+def get_optimizer():
+    return flow.optimizer.SGD(get_lr_scheduler(), momentum=0)
+
+
+def make_matmul_func(
+    a_shape,
+    b_shape,
+    trans_a,
+    trans_b,
+    alpha,
+    dtype,
+    device_type,
+    test_add_to_output,
+    fuse_add_to_output,
+    tf32,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    flow.config.enable_tensor_float_32_compute(tf32)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.enable_fuse_add_to_output(fuse_add_to_output)
+    func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def matmul_job() -> typing.Tuple[
+        flow.typing.Numpy, flow.typing.Numpy, flow.typing.Numpy, flow.typing.Numpy
+    ]:
+        a_var = flow.get_variable(
+            "a",
+            shape=a_shape,
+            dtype=flow.float32,
+            initializer=flow.random_uniform_initializer(minval=0, maxval=1),
+            trainable=True,
+        )
+        b_var = flow.get_variable(
+            "b",
+            shape=b_shape,
+            dtype=flow.float32,
+            initializer=flow.random_uniform_initializer(minval=0, maxval=1),
+            trainable=True,
+        )
+        flow.watch_diff(a_var, test_global_storage.Setter("a_diff"))
+        flow.watch_diff(b_var, test_global_storage.Setter("b_diff"))
+        if dtype is flow.float16:
+            a = flow.amp_white_identity(a_var)
+            b = flow.amp_white_identity(b_var)
+        else:
+            a = a_var
+            b = b_var
+        c = flow.matmul(a, b, trans_a, trans_b, alpha)
+        add_to = flow.get_variable(
+            "c",
+            shape=c.shape,
+            dtype=flow.float32,
+            initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
+            trainable=True,
+        )
+        if test_add_to_output:
+            flow.watch_diff(add_to, test_global_storage.Setter("add_to_diff"))
+            if dtype is flow.float16:
+                add_to = flow.amp_white_identity(add_to)
+            c = c + add_to
+        flow.watch_diff(c, test_global_storage.Setter("c_diff"))
+        get_optimizer().minimize(c)
+        return (a_var, b_var, add_to, c)
+
+    return matmul_job
+
+
+def np_matmul(a, b, trans_a=False, trans_b=False, bias=None, alpha=None):
+    assert len(a.shape) >= 2
+    assert len(b.shape) >= 2
+
+    def transpose(x):
+        if len(x.shape) == 2:
+            x = np.transpose(x)
+        else:
+            perm = list(range(x.ndim)[:-2]) + [x.ndim - 1, x.ndim - 2]
+            x = np.transpose(x, perm)
+        return x
+
+    if trans_a:
+        a = transpose(a)
+    if trans_b:
+        b = transpose(b)
+    c = np.matmul(a, b)
+    if alpha is not None:
+        c = c * float(alpha)
+    if bias is not None:
+        c = c + bias
+    return c
+
+
+def compare_with_np(
+    test_case,
+    a_shape,
+    b_shape,
+    transpose_a,
+    transpose_b,
+    alpha=1.0,
+    data_type="float32",
+    device_type="gpu",
+    test_add_to_output=False,
+    fuse_add_to_output=False,
+    enable_tf32=False,
+    rtol=1e-05,
+    atol=1e-08,
+):
+    def print_dbg_info(lhs=None, rhs=None):
+        print(
+            f"a_shape: {a_shape}, b_shape: {b_shape}, transpose_a: {transpose_a}, transpose_b: {transpose_b}, alpha: {alpha}, data_type: {data_type}, device_type: {device_type}, test_add_to_output: {test_add_to_output}, fuse_add_to_output: {fuse_add_to_output}, enable_tf32: {enable_tf32}"
+        )
+        if lhs is not None:
+            print(f"lhs: {lhs.shape}\n{lhs}")
+        if rhs is not None:
+            print(f"rhs: {rhs.shape}\n{rhs}")
+        if lhs is not None and rhs is not None:
+            diff = lhs - rhs
+            print(f"abs diff mean: {np.abs(diff).mean()}")
+            print(f"abs diff max: {np.abs(diff).max()}")
+
+    dtype = type_name_to_flow_type[data_type]
+    matmul_fn = make_matmul_func(
+        a_shape,
+        b_shape,
+        transpose_a,
+        transpose_b,
+        alpha,
+        dtype,
+        device_type,
+        test_add_to_output,
+        fuse_add_to_output,
+        enable_tf32,
+    )
+    (a, b, add_to_output, c) = matmul_fn()
+    if test_add_to_output is False:
+        add_to_output = None
+    c_ = np_matmul(a, b, transpose_a, transpose_b, bias=add_to_output, alpha=alpha)
+    comp_c_result = np.allclose(c, c_, rtol, atol)
+    if not comp_c_result:
+        print_dbg_info(c, c_)
+    test_case.assertTrue(comp_c_result)
+    c_diff = test_global_storage.Get("c_diff")
+    a_diff = test_global_storage.Get("a_diff")
+    b_diff = test_global_storage.Get("b_diff")
+    if transpose_a:
+        raise NotImplementedError
+    else:
+        a_diff_ = np_matmul(
+            c_diff, b, transpose_a, not transpose_b, bias=None, alpha=alpha
+        )
+    comp_a_diff_result = np.allclose(a_diff, a_diff_, rtol, atol)
+    if not comp_a_diff_result:
+        print_dbg_info(a_diff, a_diff_)
+    test_case.assertTrue(comp_a_diff_result)
+    if transpose_b:
+        b_diff_ = np_matmul(
+            c_diff.reshape((-1, c_diff.shape[-1])),
+            a.reshape((-1, a.shape[-1])),
+            True,
+            transpose_a,
+            bias=None,
+            alpha=alpha,
+        )
+    else:
+        b_diff_ = np_matmul(
+            a.reshape((-1, a.shape[-1])),
+            c_diff.reshape((-1, c_diff.shape[-1])),
+            not transpose_a,
+            False,
+            bias=None,
+            alpha=alpha,
+        )
+    comp_b_diff_result = np.allclose(b_diff, b_diff_, rtol, atol)
+    if not comp_b_diff_result:
+        print_dbg_info(b_diff, b_diff_)
+    test_case.assertTrue(comp_b_diff_result)
+    if test_add_to_output:
+        add_to_diff = test_global_storage.Get("add_to_diff")
+        test_case.assertTrue(np.allclose(add_to_diff, c_diff))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMatmul(flow.unittest.TestCase):
+    def test_matmul(test_case):
+        for arg in gen_arg_list():
+            if arg[0] == "cpu" and (arg[5] == "float16" or arg[7] == True):
+                continue
+            if arg[5] != "float32" and arg[7] == True:
+                continue
+            compare_with_tensorflow(*arg)
+
+    def test_broadcast_matmul(self):
+        for arg in gen_args():
+            compare_with_np(self, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_memory_zone_out_of_memory.py b/python/oneflow/compatible/single_client/test/ops/test_memory_zone_out_of_memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d5ac72fd03ceea512462d39e9f6a46e58f3c197
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_memory_zone_out_of_memory.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+from collections import OrderedDict
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+class MemoryZoneOutOfMemoryException(Exception):
+    def __init__(self, err="memory_zone_out_of_memory"):
+        Exception.__init__(self, err)
+
+
+def constant(device_type):
+    flow.env.init()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+
+    @flow.global_function(function_config=func_config)
+    def ConstantJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.constant(
+                6, dtype=flow.float, shape=(1024 * 1024 * 1024, 1024 * 1024 * 1024)
+            )
+            return x
+
+    try:
+        ConstantJob().get()
+    except Exception as e:
+        if "memory_zone_out_of_memory" in str(e):
+            print(e)
+            raise MemoryZoneOutOfMemoryException()
+
+
+def memory_zone_out_of_memory_of_gpu():
+    return constant("gpu")
+
+
+def memory_zone_out_of_memory_of_cpu():
+    return constant("cpu")
diff --git a/python/oneflow/compatible/single_client/test/ops/test_mish.py b/python/oneflow/compatible/single_client/test/ops/test_mish.py
new file mode 100644
index 0000000000000000000000000000000000000000..abb0a1a65161ea5bf3874895b40f2f8b5ee92af0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_mish.py
@@ -0,0 +1,118 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_mish_with_np(input_shape, device_type, machine_ids, device_counts):
+    input_1 = np.random.random(size=input_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_mish(input):
+        return input * np.tanh(np.log1p(np.exp(input)))
+
+    np_out_mish = np_mish(input_1)
+
+    def np_diff(input):
+        u = np.log1p(np.exp(input))
+        return np.tanh(u) + input * (1 - np.tanh(u) ** 2) * (
+            np.exp(input) / (1 + np.exp(input))
+        )
+
+    _np_grad = np_diff(input_1)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_mish(of_input_1: tp.Numpy.Placeholder(shape=input_1.shape)) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input_1.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input_1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        of_mish_out = flow.nn.mish(x_var)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_mish_out)
+        return of_mish_out
+
+    of_out_mish = oneflow_mish(input_1)
+    assert np.allclose(of_out_mish, np_out_mish)
+
+
+def _gen_arg_dict(shape, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testmish1n1d(flow.unittest.TestCase):
+    def test_mish_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3), device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_mish_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_mish_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32), device_type="gpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_mish_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testmish1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_mish_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 8, 8, 4), device_type="gpu", machine_ids="0:0-1", device_counts=2
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_mish_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_mod.py b/python/oneflow/compatible/single_client/test/ops/test_mod.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a3d735f171ac44401450c81cdc122c0e0f7434a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_mod.py
@@ -0,0 +1,78 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def GenerateTest(test_case, a_shape, b_shape):
+    @flow.global_function(function_config=func_config)
+    def ModJob(a: oft.Numpy.Placeholder(a_shape), b: oft.Numpy.Placeholder(b_shape)):
+        return a % b
+
+    a = np.random.rand(*a_shape).astype(np.float32)
+    b = np.random.rand(*b_shape).astype(np.float32)
+    y = ModJob(a, b).get().numpy()
+    test_case.assertTrue(np.allclose(y, a % b))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMod(flow.unittest.TestCase):
+    def test_naive(test_case):
+        @flow.global_function(function_config=func_config)
+        def ModJob(a: oft.Numpy.Placeholder((5, 2)), b: oft.Numpy.Placeholder((5, 2))):
+            return a % b
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = np.random.rand(5, 2).astype(np.float32)
+        z = None
+        z = ModJob(x, y).get().numpy()
+        test_case.assertTrue(np.allclose(z, x % y))
+
+    def test_broadcast(test_case):
+        @flow.global_function(function_config=func_config)
+        def ModJob(a: oft.Numpy.Placeholder((5, 2)), b: oft.Numpy.Placeholder((1, 2))):
+            return a % b
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = np.random.rand(1, 2).astype(np.float32)
+        z = None
+        z = ModJob(x, y).get().numpy()
+        test_case.assertTrue(np.allclose(z, x % y))
+
+    def test_xy_mod_x1(test_case):
+        GenerateTest(test_case, (64, 64), (64, 1))
+
+    def test_xy_mod_1y(test_case):
+        GenerateTest(test_case, (64, 64), (1, 64))
+
+    def test_xyz_mod_x1z(test_case):
+        GenerateTest(test_case, (64, 64, 64), (64, 1, 64))
+
+    def test_xyz_mod_1y1(test_case):
+        GenerateTest(test_case, (64, 64, 64), (1, 64, 1))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_mod_int.py b/python/oneflow/compatible/single_client/test/ops/test_mod_int.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d40b3901a57b3be5d986d7bc8187aeaeb90ef1d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_mod_int.py
@@ -0,0 +1,87 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.int32)
+
+
+def GenerateTest(test_case, a_shape, b_shape):
+    @flow.global_function(function_config=func_config)
+    def ModJob(
+        a: oft.Numpy.Placeholder(a_shape, dtype=flow.int32),
+        b: oft.Numpy.Placeholder(b_shape, dtype=flow.int32),
+    ):
+        return a % b
+
+    a = (np.random.rand(*a_shape) * 1000).astype(np.int32) + 1
+    b = (np.random.rand(*b_shape) * 1000).astype(np.int32) + 1
+    y = ModJob(a, b).get().numpy()
+    test_case.assertTrue(np.array_equal(y, a % b))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModInt(flow.unittest.TestCase):
+    def test_naive(test_case):
+        @flow.global_function(function_config=func_config)
+        def ModJob(
+            a: oft.Numpy.Placeholder((5, 2), dtype=flow.int32),
+            b: oft.Numpy.Placeholder((5, 2), dtype=flow.int32),
+        ):
+            return a % b
+
+        x = (np.random.rand(5, 2) * 1000).astype(np.int32) + 1
+        y = (np.random.rand(5, 2) * 1000).astype(np.int32) + 1
+        z = None
+        z = ModJob(x, y).get().numpy()
+        test_case.assertTrue(np.array_equal(z, x % y))
+
+    def test_broadcast(test_case):
+        @flow.global_function(function_config=func_config)
+        def ModJob(
+            a: oft.Numpy.Placeholder((5, 2), dtype=flow.int32),
+            b: oft.Numpy.Placeholder((1, 2), dtype=flow.int32),
+        ):
+            return a % b
+
+        x = (np.random.rand(5, 2) * 1000).astype(np.int32) + 1
+        y = (np.random.rand(1, 2) * 1000).astype(np.int32) + 1
+        z = None
+        z = ModJob(x, y).get().numpy()
+        test_case.assertTrue(np.array_equal(z, x % y))
+
+    def test_xy_mod_x1(test_case):
+        GenerateTest(test_case, (64, 64), (64, 1))
+
+    def test_xy_mod_1y(test_case):
+        GenerateTest(test_case, (64, 64), (1, 64))
+
+    def test_xyz_mod_x1z(test_case):
+        GenerateTest(test_case, (64, 64, 64), (64, 1, 64))
+
+    def test_xyz_mod_1y1(test_case):
+        GenerateTest(test_case, (64, 64, 64), (1, 64, 1))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_model.py b/python/oneflow/compatible/single_client/test/ops/test_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..20772161ba2893b4cde9dedd446610765a59a68c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_model.py
@@ -0,0 +1,142 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import tempfile
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible.single_client import experimental as flow
+from oneflow.compatible.single_client.nn.parameter import Parameter
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+class TestEagerModel(flow.unittest.TestCase):
+    def test_model(test_case):
+        model_dir = tempfile.TemporaryDirectory()
+        model_dir_path = model_dir.name
+        para = np.random.randn(2, 3)
+        in_data = np.full((2, 3), 1)
+
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w = Parameter(flow.Tensor(para))
+
+            def forward(self, x):
+                return x + self.w
+
+        class EagerModel(flow.Model):
+            def __init__(self):
+                super().__init__()
+                self.m = CustomModule()
+
+            def forward(self, x):
+                return self.m(x)
+
+            def training_step(self, batch, **kwargs):
+                return (flow.sum(self(batch)), list(self.m.parameters())[0])
+
+            def configure_optimizers(self):
+                sgd = flow.optim.SGD(
+                    [
+                        {
+                            "params": list(self.m.parameters()),
+                            "lr": 1.0,
+                            "momentum": 0.0,
+                            "scale": 1.0,
+                        }
+                    ]
+                )
+                return sgd
+
+            def validation_step(self, batch):
+                return self(batch)
+
+        class TrainData(flow.model.DataModule):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, step_idx=0, optimizer_idx=0):
+                return flow.ones((2, 3))
+
+        class ValData(flow.model.DataModule):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, step_idx=0, optimizer_idx=0):
+                return flow.ones((2, 3))
+
+        class OutputMonitor(flow.model.Callback):
+            def on_training_step_end(self, step_idx, outputs, optimizer_idx):
+                nonlocal para
+                loss = outputs[0].numpy()
+                test_case.assertTrue(
+                    np.allclose(
+                        loss, np.sum(in_data + para, dtype=np.float), 0.0001, 0.0001
+                    )
+                )
+                para -= 1
+                test_case.assertTrue(
+                    np.allclose(outputs[1].numpy(), para, 0.0001, 0.0001)
+                )
+                fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+                print(fmt_str.format(step_idx, "train loss:", loss.mean()))
+
+            def on_validation_step_end(self, step_idx, outputs):
+                nonlocal para
+                test_case.assertTrue(
+                    np.allclose(outputs.numpy(), in_data + para, 0.0001, 0.0001)
+                )
+                fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+                print(
+                    fmt_str.format(
+                        step_idx, "validation output:", outputs.numpy().mean()
+                    )
+                )
+
+        train_config = flow.model.TrainingConfig()
+        train_config.config_data(TrainData())
+        val_config = flow.model.ValidationConfig()
+        val_config.config_data(ValData())
+        val_config.config_step_interval(1)
+        ck_config = flow.model.CheckpointConfig()
+        ck_config.config_save(dirpath=model_dir_path, step_interval=5)
+        output_monitor_cb = OutputMonitor()
+        eager_md = EagerModel()
+        eager_md.fit(
+            training_config=train_config,
+            validation_config=val_config,
+            checkpoint_config=ck_config,
+            callbacks=output_monitor_cb,
+            max_steps=10,
+        )
+        step_9_para = list(eager_md.parameters())[0][0].numpy()
+        eager_md_load = EagerModel()
+        eager_md_load.load_state_dict(flow.load(model_dir_path + "-4"))
+        loaded_step_4_para = list(eager_md_load.parameters())[0][0].numpy()
+        test_case.assertTrue(
+            np.allclose(step_9_para, loaded_step_4_para - 5, 0.0001, 0.0001)
+        )
+        model_dir.cleanup()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_model_io.py b/python/oneflow/compatible/single_client/test/ops/test_model_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..44c8600b5a71918abd97bf3cc84736c6d238e2d6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_model_io.py
@@ -0,0 +1,110 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import time
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _make_gen_var_func(shape, dtype, lr):
+    @flow.global_function(type="train")
+    def gen_var(x: tp.Numpy.Placeholder(shape=shape, dtype=dtype)) -> tp.Numpy:
+        var = flow.get_variable(
+            name="var",
+            shape=shape,
+            dtype=dtype,
+            initializer=flow.random_uniform_initializer(),
+        )
+        y = var + x
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [lr]), momentum=0
+        ).minimize(y)
+        return var
+
+    return gen_var
+
+
+def _make_get_var_func(shape, dtype):
+    @flow.global_function(type="predict")
+    def get_var() -> tp.Numpy:
+        return flow.get_variable(
+            name="var",
+            shape=shape,
+            dtype=dtype,
+            initializer=flow.random_uniform_initializer(),
+            reuse=True,
+        )
+
+    return get_var
+
+
+def _load_snapshot_manually(path, shape, dtype):
+    var_path = os.path.join(path, "var", "out")
+    return np.fromfile(
+        var_path, dtype=flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+    ).reshape(*shape)
+
+
+def _test_model_io(test_case, shape, dtype, lr, num_iters):
+    flow.clear_default_session()
+    flow.config.enable_legacy_model_io(True)
+    gen_var = _make_gen_var_func(shape, dtype, lr)
+    model_save_root_dir = "./log/snapshot/"
+    if not os.path.exists(model_save_root_dir):
+        os.makedirs(model_save_root_dir)
+    snapshot_path = model_save_root_dir + "snapshot-{}".format(
+        time.strftime("%Y%m%d-%H:%M:%S")
+    )
+    checkpoint = flow.train.CheckPoint()
+    checkpoint.init()
+    variables = []
+    for i in range(num_iters):
+        var = gen_var(
+            np.random.rand(*shape).astype(
+                flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+            )
+        )
+        if i > 0:
+            test_case.assertTrue(np.allclose(var, variables[-1] - lr / var.size))
+        variables.append(var)
+        checkpoint.save("{}-{}".format(snapshot_path, i))
+    flow.clear_default_session()
+    get_var = _make_get_var_func(shape, dtype)
+    final_snapshot_path = "{}-{}".format(snapshot_path, num_iters - 1)
+    checkpoint = flow.train.CheckPoint()
+    checkpoint.load(final_snapshot_path)
+    final_var = get_var()
+    var_from_file = _load_snapshot_manually(final_snapshot_path, shape, dtype)
+    test_case.assertTrue(np.allclose(final_var, var_from_file))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModelIo(flow.unittest.TestCase):
+    def test_model_io_case_0(test_case):
+        if flow.eager_execution_enabled():
+            print("\nSkip under erger mode!")
+            return
+        _test_model_io(test_case, (2, 2), flow.float32, 0.01, 10)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_module_container.py b/python/oneflow/compatible/single_client/test/ops/test_module_container.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc3407f255ea9d54cfa02b9bff0b4f1852859bee
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_module_container.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from typing import Tuple
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible.single_client import experimental as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    "module doesn't work in lazy mode now",
+)
+class TestContainer(flow.unittest.TestCase):
+    def test_module_forward(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self, w):
+                super().__init__()
+                self.w = w
+
+            def forward(self, x):
+                return x + self.w
+
+        m1 = CustomModule(5)
+        m2 = CustomModule(4)
+        s = flow.nn.Sequential(m1, m2)
+        test_case.assertEqual(s(1), 10)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_moments.py b/python/oneflow/compatible/single_client/test/ops/test_moments.py
new file mode 100644
index 0000000000000000000000000000000000000000..db35ec2565f879e1740a291b6cbcc44ba773483c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_moments.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape, data_type, axes):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if max(axes) >= len(x_shape):
+        return
+
+    @flow.global_function(type="train", function_config=func_config)
+    def MomentsJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            (m, v) = flow.nn.moments(x, axes)
+            loss = m + v
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return (m, v)
+
+    of_out = MomentsJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.moments(x, axes)
+        tf_loss = tf_out[0] + tf_out[1]
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_loss, x, loss_diff)
+    for i in range(2):
+        assert np.allclose(of_out[i].numpy(), tf_out[i].numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMoments(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_moments(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 20, 30), (20,)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["axes"] = [[0], [0, 2], [0, 1]]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_mseloss.py b/python/oneflow/compatible/single_client/test/ops/test_mseloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..f79d52a9cf2b8023b23b1d671f40864dd92882e9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_mseloss.py
@@ -0,0 +1,147 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_mseloss_with_np(
+    input_shape, target_shape, device_type, machine_ids, device_counts
+):
+    input = np.random.random(size=input_shape).astype(np.float32)
+    target = np.random.random(size=target_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+
+    def np_mseloss(np_input, np_target):
+        np_mse = np.square(np_target - np_input)
+        np_mse_mean = np.mean(np_mse)
+        np_mse_sum = np.sum(np_mse)
+        return {
+            "np_mse_loss": np_mse,
+            "np_mse_loss_mean": np_mse_mean,
+            "np_mse_loss_sum": np_mse_sum,
+        }
+
+    def np_mseloss_grad(np_input, np_target):
+        elem_cnt = np_input.size
+        np_mse_grad_mean = -2 * (np_target - np_input) / elem_cnt
+        return {"np_mse_grad_mean": np_mse_grad_mean}
+
+    np_out_mseloss_dict = np_mseloss(input, target)
+    np_grad_dict = np_mseloss_grad(input, target)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, np_grad_dict["np_mse_grad_mean"])
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_mseloss(
+        of_input: tp.Numpy.Placeholder(shape=input.shape),
+        of_target: tp.Numpy.Placeholder(shape=target.shape),
+    ) -> Dict[str, tp.Numpy]:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        mseloss = flow.nn.MSELoss(x_var, of_target, reduction="none", name="of_mseloss")
+        mseloss_mean = flow.nn.MSELoss(
+            x_var, of_target, reduction="mean", name="of_mseloss_reduce_mean"
+        )
+        mseloss_sum = flow.nn.MSELoss(
+            x_var, of_target, reduction="sum", name="of_mseloss_reduce_sum"
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(mseloss_mean)
+        return {
+            "of_mse_loss": mseloss,
+            "of_mse_loss_mean": mseloss_mean,
+            "of_mse_loss_sum": mseloss_sum,
+        }
+
+    of_out_mseloss_dict = oneflow_mseloss(input, target)
+    assert np.allclose(
+        of_out_mseloss_dict["of_mse_loss"], np_out_mseloss_dict["np_mse_loss"]
+    )
+    assert np.allclose(
+        of_out_mseloss_dict["of_mse_loss_mean"], np_out_mseloss_dict["np_mse_loss_mean"]
+    )
+    assert np.allclose(
+        of_out_mseloss_dict["of_mse_loss_sum"], np_out_mseloss_dict["np_mse_loss_sum"]
+    )
+
+
+def _gen_arg_dict(shape, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["target_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testmseloss1n1d(flow.unittest.TestCase):
+    def test_mseloss_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16), device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_mseloss_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_mseloss_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32), device_type="gpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_mseloss_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testmseloss1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_mseloss_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 16), device_type="gpu", machine_ids="0:0-1", device_counts=2
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_mseloss_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_multi_optimizer.py b/python/oneflow/compatible/single_client/test/ops/test_multi_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..20dd32143d9b20d0cd5334d1b110b868a202c921
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_multi_optimizer.py
@@ -0,0 +1,169 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_multi_optimizer_with_tensorflow(
+    device_type,
+    var1_shape,
+    var2_shape,
+    var3_shape,
+    sgd_opt_args,
+    rmsprop_opt_args,
+    adam_opt_args,
+    train_iters,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def TestMultiOptimizerJob():
+        with flow.scope.placement(device_type, "0:0-0"):
+            var1 = flow.get_variable(
+                name="var1",
+                shape=var1_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            var2 = flow.get_variable(
+                name="var2",
+                shape=var2_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            var3 = flow.get_variable(
+                name="var3",
+                shape=var3_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.math.reduce_sum(var1 + var2 + var3)
+            sgd_opt = flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [sgd_opt_args["lr"]]),
+                momentum=sgd_opt_args["momentum"],
+                variables=["var1"],
+            )
+            rmsprop_opt = flow.optimizer.RMSProp(
+                flow.optimizer.PiecewiseConstantScheduler([], [rmsprop_opt_args["lr"]]),
+                decay_rate=rmsprop_opt_args["decay_rate"],
+                epsilon=0,
+                centered=rmsprop_opt_args["centered"],
+                variables=["var2"],
+            )
+            adam_opt = flow.optimizer.Adam(
+                flow.optimizer.PiecewiseConstantScheduler([], [adam_opt_args["lr"]]),
+                beta1=adam_opt_args["beta1"],
+                beta2=adam_opt_args["beta2"],
+                epsilon=adam_opt_args["epsilon"],
+                do_bias_correction=True,
+                variables=["var3"],
+            )
+            flow.optimizer.CombinedOptimizer([sgd_opt, rmsprop_opt, adam_opt]).minimize(
+                loss
+            )
+            return (var1, var2, var3)
+
+    init_var1 = None
+    init_var2 = None
+    init_var3 = None
+    for i in range(train_iters + 1):
+        (var1, var2, var3) = TestMultiOptimizerJob().get()
+        if i == 0:
+            init_var1 = np.copy(var1.numpy())
+            init_var2 = np.copy(var2.numpy())
+            init_var3 = np.copy(var3.numpy())
+    tf_var1 = tf.Variable(init_var1)
+    tf_var2 = tf.Variable(init_var2)
+    tf_var3 = tf.Variable(init_var3)
+    tf_sgd_opt = tf.keras.optimizers.SGD(
+        learning_rate=sgd_opt_args["lr"],
+        momentum=sgd_opt_args["momentum"],
+        nesterov=False,
+    )
+    tf_rmsprop_opt = tf.keras.optimizers.RMSprop(
+        learning_rate=rmsprop_opt_args["lr"],
+        rho=rmsprop_opt_args["decay_rate"],
+        momentum=0.0,
+        epsilon=0,
+        centered=rmsprop_opt_args["centered"],
+    )
+    tf_adam_opt = tf.keras.optimizers.Adam(
+        learning_rate=adam_opt_args["lr"],
+        beta_1=adam_opt_args["beta1"],
+        beta_2=adam_opt_args["beta2"],
+        epsilon=adam_opt_args["epsilon"],
+        amsgrad=False,
+    )
+    for i in range(train_iters):
+        with tf.GradientTape(persistent=True) as tape:
+            loss = tf.math.reduce_sum(tf_var1 + tf_var2 + tf_var3)
+        tf_var1_grad = tape.gradient([loss], tf_var1)
+        tf_var2_grad = tape.gradient([loss], tf_var2)
+        tf_var3_grad = tape.gradient([loss], tf_var3)
+        tf_sgd_opt.apply_gradients([(tf_var1_grad, tf_var1)])
+        tf_rmsprop_opt.apply_gradients([(tf_var2_grad, tf_var2)])
+        tf_adam_opt.apply_gradients([(tf_var3_grad, tf_var3)])
+    assert np.allclose(
+        var1.flatten(), tf_var1.numpy().flatten(), rtol=0.0001, atol=0.0001
+    )
+    assert np.allclose(
+        var2.flatten(), tf_var2.numpy().flatten(), rtol=0.005, atol=0.005
+    )
+    assert np.allclose(
+        var3.flatten(), tf_var3.numpy().flatten(), rtol=0.0001, atol=0.0001
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMultiOptimizer(flow.unittest.TestCase):
+    def test_multi_optimizer(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["var1_shape"] = [(10,)]
+        arg_dict["var2_shape"] = [(10,)]
+        arg_dict["var3_shape"] = [(10,)]
+        arg_dict["sgd_opt_args"] = [{"lr": 1, "momentum": 0.9}]
+        arg_dict["rmsprop_opt_args"] = [
+            {"lr": 0.5, "decay_rate": 0.9, "centered": False}
+        ]
+        arg_dict["adam_opt_args"] = [
+            {"lr": 2, "beta1": 0.9, "beta2": 0.99, "epsilon": 1e-09}
+        ]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_multi_optimizer_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_multi_process.py b/python/oneflow/compatible/single_client/test/ops/test_multi_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..d921a5e3ff0ca025e693edcebba4016f15b2f536
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_multi_process.py
@@ -0,0 +1,111 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@unittest.skipIf(flow.sysconfig.has_rpc_backend_grpc() == False, "lacks grpc")
+@flow.unittest.skip_unless_1n4d()
+@unittest.skipIf(
+    os.getenv("ONEFLOW_TEST_GITHUB_HOSTED"),
+    "this will fail because github hosted VM has only two CPU cores",
+)
+class TestMultiProcess(flow.unittest.TestCase):
+    def test_multi_process(test_case):
+        flow.config.gpu_device_num(4)
+        func_config = flow.FunctionConfig()
+        func_config.concurrency_width(1)
+
+        @flow.global_function()
+        def Foo():
+            with flow.scope.placement("gpu", "0:0-3"):
+                x = flow.get_variable(
+                    "x",
+                    shape=(2, 5),
+                    dtype=flow.float,
+                    initializer=flow.random_uniform_initializer(minval=0, maxval=1),
+                    trainable=False,
+                )
+            return x
+
+        of_ret = Foo().get()
+        test_case.assertEqual(of_ret.numpy().shape, (2, 5))
+
+    def test_worker_to_master_communication(test_case):
+        flow.config.gpu_device_num(4)
+        func_config = flow.FunctionConfig()
+        func_config.concurrency_width(1)
+
+        @flow.global_function()
+        def Foo():
+            with flow.scope.placement("gpu", "0:0"):
+                x = flow.get_variable(
+                    "x",
+                    shape=(2, 5),
+                    dtype=flow.float,
+                    initializer=flow.random_uniform_initializer(minval=0, maxval=1),
+                    trainable=False,
+                )
+            with flow.scope.placement("gpu", "0:3"):
+                y = flow.get_variable(
+                    "y",
+                    shape=(2, 5),
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                    trainable=False,
+                )
+                flow.assign(y, x)
+            return y
+
+        of_ret = Foo().get()
+        test_case.assertEqual(of_ret.numpy().shape, (2, 5))
+
+    def test_worker_to_worker_communication(test_case):
+        flow.config.gpu_device_num(4)
+        func_config = flow.FunctionConfig()
+        func_config.concurrency_width(1)
+
+        @flow.global_function()
+        def Foo():
+            with flow.scope.placement("gpu", "0:1"):
+                x = flow.get_variable(
+                    "x",
+                    shape=(2, 5),
+                    dtype=flow.float,
+                    initializer=flow.random_uniform_initializer(minval=0, maxval=1),
+                    trainable=False,
+                )
+            with flow.scope.placement("gpu", "0:2"):
+                y = flow.get_variable(
+                    "y",
+                    shape=(2, 5),
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                    trainable=False,
+                )
+                flow.assign(y, x)
+            return y
+
+        of_ret = Foo().get()
+        test_case.assertEqual(of_ret.numpy().shape, (2, 5))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_multi_square_sum.py b/python/oneflow/compatible/single_client/test/ops/test_multi_square_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..58401403810d23749a13cfde89ab2fcd988d3b2e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_multi_square_sum.py
@@ -0,0 +1,76 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def multi_square_sum(x, name=None):
+    return (
+        flow.user_op_builder(name if name is not None else "MultiSquareSum")
+        .Op("multi_square_sum")
+        .Input("x", x)
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def _check(test_case, xs, y):
+    ref_y = np.sum(np.array([np.sum(x ** 2) for x in xs]))
+    test_case.assertTrue(np.allclose(y, ref_y))
+
+
+def _run_test(test_case, x, n, dtype, device):
+    flow.clear_default_session()
+
+    @flow.global_function(function_config=func_config)
+    def multi_square_sum_job(x: oft.Numpy.Placeholder(x.shape, dtype=dtype)):
+        with flow.scope.placement(device, "0:0"):
+            xs = [x + 0.1 * i for i in range(n)]
+            return multi_square_sum(xs)
+
+    y = multi_square_sum_job(x).get()
+    _check(test_case, [(x + 0.1 * i).astype(np.float32) for i in range(n)], y.numpy())
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMultiSquareSum(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_multi_square_sum_random_gpu(test_case):
+        x = np.random.rand(3, 4, 5).astype(np.float32)
+        _run_test(test_case, x, 5, flow.float32, "gpu")
+        _run_test(test_case, x, 5, flow.float32, "gpu")
+        _run_test(test_case, x, 88, flow.float32, "gpu")
+        _run_test(test_case, x, 64, flow.float32, "gpu")
+
+    def test_multi_square_sum_random_cpu(test_case):
+        x = np.random.rand(3, 4, 5).astype(np.float32)
+        _run_test(test_case, x, 5, flow.float32, "cpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_multiply.py b/python/oneflow/compatible/single_client/test/ops/test_multiply.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab344db03daf46da1ee3c090df883955a824908d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_multiply.py
@@ -0,0 +1,130 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import (
+    Args,
+    CompareOpWithTensorFlow,
+    GenArgDict,
+    test_global_storage,
+    type_name_to_flow_type,
+    type_name_to_np_type,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _test_element_wise_mul_fw_bw(test_case, device, shape, type_name):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    np_type = type_name_to_np_type[type_name]
+    flow_type = type_name_to_flow_type[type_name]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_element_wise_mul_job(
+        x: oft.Numpy.Placeholder(shape, dtype=flow.float),
+        y: oft.Numpy.Placeholder(shape, dtype=flow.float),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            x += flow.get_variable(
+                name="vx",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            y += flow.get_variable(
+                name="vy",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x = flow.cast(x, dtype=flow_type)
+            y = flow.cast(y, dtype=flow_type)
+            out = flow.math.multiply(x, y)
+            out = flow.cast(out, dtype=flow.float)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(out)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(y, test_global_storage.Setter("y"))
+            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
+            flow.watch(out, test_global_storage.Setter("out"))
+            flow.watch_diff(out, test_global_storage.Setter("out_diff"))
+            return out
+
+    x = np.random.randint(low=0, high=10, size=shape).astype(np.float32)
+    y = np.random.randint(low=0, high=10, size=shape).astype(np.float32)
+    test_element_wise_mul_job(x, y).get()
+    test_case.assertTrue(
+        np.allclose(
+            test_global_storage.Get("x") * test_global_storage.Get("y"),
+            test_global_storage.Get("out"),
+        )
+    )
+    test_case.assertTrue(
+        np.allclose(
+            test_global_storage.Get("out_diff") * test_global_storage.Get("x"),
+            test_global_storage.Get("y_diff"),
+        )
+    )
+    test_case.assertTrue(
+        np.allclose(
+            test_global_storage.Get("out_diff") * test_global_storage.Get("y"),
+            test_global_storage.Get("x_diff"),
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMultiply(flow.unittest.TestCase):
+    def test_scalar_mul(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.multiply]
+        arg_dict["tf_op"] = [tf.math.multiply]
+        arg_dict["input_shape"] = [(10, 10, 10)]
+        arg_dict["op_args"] = [
+            Args([1]),
+            Args([-1]),
+            Args([84223.19348]),
+            Args([-3284.139]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareOpWithTensorFlow(**arg)
+
+    def test_element_wise_mul_fw_bw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["gpu", "cpu"]
+        arg_dict["shape"] = [(96, 96)]
+        arg_dict["type_name"] = ["float32", "double", "int8", "int32", "int64"]
+        for arg in GenArgDict(arg_dict):
+            _test_element_wise_mul_fw_bw(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_namescope.py b/python/oneflow/compatible/single_client/test/ops/test_namescope.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae2ce5e431af2bb7c17367b8aa780c1e738f9c0f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_namescope.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNameScope(flow.unittest.TestCase):
+    def test_name_scope(test_case):
+        flow.clear_default_session()
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+
+        def get_var(var_name):
+            return flow.get_variable(
+                name=var_name,
+                shape=(2, 256, 14, 14),
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(),
+            )
+
+        @flow.global_function(function_config=func_config)
+        def test_name_scope_job():
+            with flow.scope.namespace("backbone"):
+                with flow.scope.namespace("branch"):
+                    var1 = get_var("var")
+                with flow.scope.namespace("branch"):
+                    var2 = get_var("var")
+            var3 = get_var("backbone-branch-var")
+            return (var1, var2, var3)
+
+        (var1, var2, var3) = test_name_scope_job().get()
+        test_case.assertTrue(np.array_equal(var1.numpy(), var2.numpy()))
+        test_case.assertTrue(np.array_equal(var1.numpy(), var3.numpy()))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nccl_use_compute_stream.py b/python/oneflow/compatible/single_client/test/ops/test_nccl_use_compute_stream.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d9737a505e8c57937a889c44af1808554757bb7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nccl_use_compute_stream.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_split_to_split_enable_all_to_all(test_case, src_axis, dst_axis):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(2)
+    flow.config.nccl_use_compute_stream(True)
+    flow.config.disable_group_boxing_by_dst_parallel(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def split_to_split_job(x: oft.Numpy.Placeholder((32, 16, 64, 48))):
+        with flow.scope.placement("gpu", "0:0-1"):
+            src = flow.identity(x.with_distribute(flow.distribute.split(src_axis)))
+            dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
+        return dst
+
+    x = np.random.rand(32, 16, 64, 48).astype(np.float32)
+    y = split_to_split_job(x).get().numpy()
+    test_case.assertTrue(np.array_equal(x, y))
+
+
+def _test_split_to_broadcast(test_case, src_axis):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(2)
+    flow.config.nccl_use_compute_stream(True)
+    flow.config.disable_group_boxing_by_dst_parallel(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def split_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96))):
+        with flow.scope.placement("gpu", "0:0-1"):
+            src = flow.identity(x.with_distribute(flow.distribute.split(src_axis)))
+            dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
+        return dst
+
+    x = np.random.rand(96, 96).astype(np.float32)
+    y = split_to_broadcast_job(x).get().numpy()
+    test_case.assertTrue(np.array_equal(x, y))
+
+
+def _test_partial_sum_to_split(test_case, dst_axis):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(2)
+    flow.config.nccl_use_compute_stream(True)
+    flow.config.disable_group_boxing_by_dst_parallel(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def partial_sum_to_split_job(x: oft.Numpy.Placeholder((96, 96, 96))):
+        with flow.scope.placement("gpu", "0:0-1"):
+            src = flow.identity(x.with_distribute(flow.distribute.split(0)))
+            src = flow.math.reduce_sum(src, axis=0)
+            dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
+        return dst
+
+    x = np.random.uniform(-1e-05, 1e-05, (96, 96, 96)).astype(np.float32)
+    y = partial_sum_to_split_job(x).get().numpy()
+    test_case.assertTrue(np.allclose(np.sum(x, axis=0), y))
+
+
+def _test_partial_sum_to_broadcast(test_case):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(2)
+    flow.config.nccl_use_compute_stream(True)
+    flow.config.disable_group_boxing_by_dst_parallel(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def partial_sum_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96, 96))):
+        with flow.scope.placement("gpu", "0:0-1"):
+            src = flow.identity(x.with_distribute(flow.distribute.split(0)))
+            src = flow.math.reduce_sum(src, axis=0)
+            dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
+        return dst
+
+    x = np.random.uniform(-1e-05, 1e-05, (96, 96, 96)).astype(np.float32)
+    y = partial_sum_to_broadcast_job(x).get().numpy()
+    test_case.assertTrue(np.allclose(np.sum(x, axis=0), y))
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestNcclUseComputeStream(flow.unittest.TestCase):
+    def test_split_to_split_all_to_all(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_axis"] = [0, 1, 2, 3]
+        arg_dict["dst_axis"] = [0, 1, 2, 3]
+        for arg in GenArgList(arg_dict):
+            (src_axis, dst_axis) = arg
+            if src_axis == dst_axis:
+                continue
+            _test_split_to_split_enable_all_to_all(test_case, *arg)
+
+    def test_split_to_broadcast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["src_axis"] = [0, 1]
+        for arg in GenArgList(arg_dict):
+            _test_split_to_broadcast(test_case, *arg)
+
+    def test_partial_sum_to_split(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["dst_axis"] = [0, 1]
+        for arg in GenArgList(arg_dict):
+            _test_partial_sum_to_split(test_case, *arg)
+
+    def test_partial_sum_to_broadcast(test_case):
+        _test_partial_sum_to_broadcast(test_case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nn_conv1d.py b/python/oneflow/compatible/single_client/test/ops/test_nn_conv1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c607c4b82c396885bf9f7c937f14ba19291894d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nn_conv1d.py
@@ -0,0 +1,175 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type,
+    x_shape,
+    filters,
+    kernel_size,
+    groups,
+    of_padding="SAME",
+    tf_padding="SAME",
+    stride=1,
+    data_format="NCDHW",
+    dilation=1,
+):
+    if os.getenv("ONEFLOW_TEST_CPU_ONLY") and dilation > 1:
+        return
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.cudnn_conv_heuristic_search_algo(False)
+    if data_format == "NCW":
+        xy_data_transpose = (0, 2, 1)
+        weight_data_transpose = (2, 1, 0)
+    else:
+        xy_data_transpose = (0, 1, 2)
+        weight_data_transpose = (1, 2, 0)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            if data_format == "NCW":
+                weight_shape = (filters, x.shape[1] // groups, kernel_size)
+            else:
+                weight_shape = (filters, kernel_size, x.shape[2] // groups)
+            weight = flow.get_variable(
+                "conv-weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            loss = flow.nn.conv1d(
+                x,
+                weight,
+                strides=[stride],
+                padding=of_padding,
+                data_format=data_format,
+                dilations=dilation,
+                groups=groups,
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(xy_data_transpose))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        weight = tf.Variable(
+            test_global_storage.Get("weight").transpose(weight_data_transpose)
+        )
+        tf_out = tf.nn.conv1d(
+            x,
+            weight,
+            stride=[1, stride, 1],
+            padding=tf_padding,
+            data_format="NWC",
+            dilations=[1, dilation, 1],
+        )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(xy_data_transpose)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    assert np.allclose(
+        of_out.numpy().transpose(xy_data_transpose),
+        tf_out.numpy(),
+        rtol=1e-05,
+        atol=1e-05,
+    )
+    assert np.allclose(
+        test_global_storage.Get("x_diff").transpose(xy_data_transpose),
+        tf_x_diff.numpy(),
+        rtol=0.0001,
+        atol=0.0001,
+    )
+    assert np.allclose(
+        test_global_storage.Get("weight_diff").transpose(weight_data_transpose),
+        tf_weight_diff.numpy(),
+        rtol=1e-05,
+        atol=1e-05,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNnConv1d(flow.unittest.TestCase):
+    def test_padding_valid(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10, 32, 10)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["VALID"]
+        arg_dict["tf_padding"] = ["VALID"]
+        arg_dict["stride"] = [2]
+        arg_dict["data_format"] = ["NCW", "NWC"]
+        arg_dict["dilation"] = [2]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_padding_same(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 32, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["SAME_UPPER"]
+        arg_dict["tf_padding"] = ["SAME"]
+        arg_dict["stride"] = [2]
+        arg_dict["data_format"] = ["NCW"]
+        arg_dict["dilation"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d.py b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f5a657633ec5c218581860a0c9aaad660896de5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d.py
@@ -0,0 +1,264 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def grouped_convolution2D(
+    inputs, filters, padding, num_groups, strides=None, dilation_rate=None
+):
+    input_list = tf.split(inputs, num_groups, axis=-1)
+    filter_list = tf.split(filters, num_groups, axis=-1)
+    output_list = []
+    for (conv_idx, (input_tensor, filter_tensor)) in enumerate(
+        zip(input_list, filter_list)
+    ):
+        output_list.append(
+            tf.nn.conv2d(
+                input_tensor,
+                filter_tensor,
+                padding="VALID",
+                strides=[1, 1, 1, 1],
+                data_format="NHWC",
+            )
+        )
+    outputs = tf.concat(output_list, axis=-1)
+    return outputs
+
+
+def compare_with_tensorflow(
+    device_type,
+    x_shape,
+    filters,
+    kernel_size,
+    groups,
+    data_format="NCHW",
+    padding="VALID",
+    stride=1,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if data_format == "NCHW":
+        xy_data_transpose = (0, 2, 3, 1)
+        weight_data_transpose = (2, 3, 1, 0)
+    else:
+        xy_data_transpose = (0, 1, 2, 3)
+        weight_data_transpose = (1, 2, 3, 0)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            if data_format == "NCHW":
+                weight_shape = (filters, x.shape[1] // groups, kernel_size, kernel_size)
+            else:
+                weight_shape = (filters, kernel_size, kernel_size, x.shape[3] // groups)
+            weight = flow.get_variable(
+                "conv-weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            loss = flow.nn.conv2d(
+                x,
+                weight,
+                strides=[stride, stride],
+                padding=padding,
+                data_format=data_format,
+                dilations=[1, 1],
+                groups=groups,
+                name="conv",
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(xy_data_transpose))
+        assert groups > 0
+        assert filters % groups == 0
+        if groups == 1:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(weight_data_transpose)
+            )
+            tf_out = tf.nn.conv2d(
+                x,
+                weight,
+                strides=[1, stride, stride, 1],
+                padding=padding,
+                data_format="NHWC",
+            )
+        else:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(weight_data_transpose)
+            )
+            tf_out = grouped_convolution2D(
+                x, weight, padding=padding, num_groups=groups
+            )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(xy_data_transpose)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    max_diff = np.max(
+        np.absolute(of_out.numpy().transpose(xy_data_transpose) - tf_out.numpy())
+    )
+    assert np.allclose(
+        of_out.numpy().transpose(xy_data_transpose),
+        tf_out.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    ), max_diff
+    assert np.allclose(
+        test_global_storage.Get("x_diff").transpose(xy_data_transpose),
+        tf_x_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+    assert np.allclose(
+        test_global_storage.Get("weight_diff").transpose(weight_data_transpose),
+        tf_weight_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNnConv2d(flow.unittest.TestCase):
+    def test_cpu1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["x_shape"] = [(10, 10, 10, 32)]
+        arg_dict["filters"] = [128]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [32]
+        arg_dict["data_format"] = ["NHWC"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_cpu2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["x_shape"] = [(10, 32, 226, 226)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [4]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_cpu3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [5]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [32]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [8]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv4(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv5(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [8]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_conv6(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_bias.py b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_bias.py
new file mode 100644
index 0000000000000000000000000000000000000000..26ab7a1ffd00ff4780d002546902cfbdb86c2f7f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_bias.py
@@ -0,0 +1,269 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def grouped_convolution2D(
+    inputs, filters, padding, num_groups, strides=None, dilation_rate=None
+):
+    input_list = tf.split(inputs, num_groups, axis=-1)
+    filter_list = tf.split(filters, num_groups, axis=-1)
+    output_list = []
+    for (conv_idx, (input_tensor, filter_tensor)) in enumerate(
+        zip(input_list, filter_list)
+    ):
+        output_list.append(
+            tf.nn.conv2d(
+                input_tensor,
+                filter_tensor,
+                padding="VALID",
+                strides=[1, 1, 1, 1],
+                data_format="NHWC",
+            )
+        )
+    outputs = tf.concat(output_list, axis=-1)
+    return outputs
+
+
+def compare_with_tensorflow(
+    device_type,
+    x_shape,
+    filters,
+    kernel_size,
+    groups,
+    data_format="NCHW",
+    padding="VALID",
+    stride=1,
+):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    flow.clear_default_session()
+    if data_format == "NCHW":
+        xy_data_transpose = (0, 2, 3, 1)
+        weight_data_transpose = (2, 3, 1, 0)
+    else:
+        xy_data_transpose = (0, 1, 2, 3)
+        weight_data_transpose = (1, 2, 3, 0)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def RunConvBias():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            if data_format == "NCHW":
+                weight_shape = (filters, x.shape[1] // groups, kernel_size, kernel_size)
+            else:
+                weight_shape = (filters, kernel_size, kernel_size, x.shape[3] // groups)
+            weight = flow.get_variable(
+                "conv-weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            bias = flow.get_variable(
+                "conv-bias",
+                shape=(filters,),
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            loss = flow.nn.conv2d(
+                x,
+                weight,
+                bias=bias,
+                strides=[stride, stride],
+                padding=padding,
+                dilations=[1, 1],
+                groups=groups,
+                name="conv",
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(bias, test_global_storage.Setter("bias"))
+            flow.watch_diff(bias, test_global_storage.Setter("bias_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = RunConvBias().get()
+    flow.clear_default_session()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(xy_data_transpose))
+        assert groups > 0
+        assert filters % groups == 0
+        if groups == 1:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(weight_data_transpose)
+            )
+            conv_out = tf.nn.conv2d(
+                x,
+                weight,
+                strides=[1, stride, stride, 1],
+                padding=padding,
+                data_format="NHWC",
+            )
+        else:
+            weight = tf.Variable(
+                test_global_storage.Get("weight").transpose(weight_data_transpose)
+            )
+            conv_out = grouped_convolution2D(
+                x, weight, padding=padding, num_groups=groups
+            )
+        bias = tf.Variable(test_global_storage.Get("bias"))
+        tf_out = tf.nn.bias_add(conv_out, bias, data_format="NHWC")
+    loss_diff = test_global_storage.Get("loss_diff").transpose(xy_data_transpose)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    tf_bias_diff = tape.gradient(tf_out, bias, loss_diff)
+    max_diff = np.max(
+        np.absolute(of_out.numpy().transpose(xy_data_transpose) - tf_out.numpy())
+    )
+    assert np.allclose(
+        of_out.numpy().transpose(xy_data_transpose),
+        tf_out.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    ), max_diff
+    assert np.allclose(
+        test_global_storage.Get("x_diff").transpose(xy_data_transpose),
+        tf_x_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+    assert np.allclose(
+        test_global_storage.Get("weight_diff").transpose(weight_data_transpose),
+        tf_weight_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+    assert np.allclose(
+        test_global_storage.Get("bias_diff"),
+        tf_bias_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNnConv2dBias(flow.unittest.TestCase):
+    def test_cpu_1x1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["x_shape"] = [(3, 32, 128, 128)]
+        arg_dict["filters"] = [5]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_cpu_depthwise(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10)]
+        arg_dict["filters"] = [128]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_cpu_group(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["x_shape"] = [(10, 32, 226, 226)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_cpu_5x5(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [5]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_gpu_1x1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [1]
+        arg_dict["groups"] = [8]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_gpu_depthwise(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_gpu_group(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_gpu_5x5(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [5]
+        arg_dict["groups"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_padding.py b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f98ee78193e6578020e902e619340ffacd81e1f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_padding.py
@@ -0,0 +1,250 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type,
+    x_shape,
+    filters,
+    kernel_size,
+    groups,
+    of_padding="SAME",
+    tf_padding="SAME",
+    stride_h=1,
+    stride_w=1,
+    data_format="NCHW",
+    dilation_h=1,
+    dilation_w=1,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    if data_format == "NCHW":
+        xy_data_transpose = (0, 2, 3, 1)
+        weight_data_transpose = (2, 3, 1, 0)
+    else:
+        xy_data_transpose = (0, 1, 2, 3)
+        weight_data_transpose = (1, 2, 3, 0)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            if data_format == "NCHW":
+                weight_shape = (filters, x.shape[1] // groups, kernel_size, kernel_size)
+            else:
+                weight_shape = (filters, kernel_size, kernel_size, x.shape[3] // groups)
+            weight = flow.get_variable(
+                "conv-weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            loss = flow.nn.conv2d(
+                x,
+                weight,
+                strides=[stride_h, stride_w],
+                padding=of_padding,
+                data_format=data_format,
+                dilations=[dilation_h, dilation_w],
+                groups=groups,
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(xy_data_transpose))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        weight = tf.Variable(
+            test_global_storage.Get("weight").transpose(weight_data_transpose)
+        )
+        tf_out = tf.nn.conv2d(
+            x,
+            weight,
+            strides=[1, stride_h, stride_w, 1],
+            padding=tf_padding,
+            data_format="NHWC",
+            dilations=[1, dilation_h, dilation_w, 1],
+        )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(xy_data_transpose)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    idx = np.where(
+        np.abs(of_out.numpy().transpose(xy_data_transpose) - tf_out.numpy()) > 0.0005
+    )
+    assert np.allclose(
+        of_out.numpy().transpose(xy_data_transpose),
+        tf_out.numpy(),
+        rtol=1e-05,
+        atol=1e-05,
+    )
+    assert np.allclose(
+        test_global_storage.Get("x_diff").transpose(xy_data_transpose),
+        tf_x_diff.numpy(),
+        rtol=0.0001,
+        atol=0.0001,
+    )
+    assert np.allclose(
+        test_global_storage.Get("weight_diff").transpose(weight_data_transpose),
+        tf_weight_diff.numpy(),
+        rtol=1e-05,
+        atol=1e-05,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNnConv2dPadding(flow.unittest.TestCase):
+    def test_padding_valid(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10), (10, 32, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["VALID"]
+        arg_dict["tf_padding"] = ["VALID"]
+        arg_dict["stride_h"] = [1]
+        arg_dict["stride_w"] = [1]
+        arg_dict["data_format"] = ["NCHW", "NHWC"]
+        arg_dict["dilation_h"] = [2]
+        arg_dict["dilation_w"] = [3]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_padding_same(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10), (10, 32, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["SAME_UPPER"]
+        arg_dict["tf_padding"] = ["SAME"]
+        arg_dict["stride_h"] = [2]
+        arg_dict["stride_w"] = [3]
+        arg_dict["data_format"] = ["NCHW", "NHWC"]
+        arg_dict["dilation_h"] = [1]
+        arg_dict["dilation_w"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10), (10, 32, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 1], [1, 0], [0, 0]]]
+        arg_dict["tf_padding"] = [[[0, 0], [0, 1], [1, 0], [0, 0]]]
+        arg_dict["stride_h"] = [2]
+        arg_dict["stride_w"] = [3]
+        arg_dict["data_format"] = ["NHWC"]
+        arg_dict["dilation_h"] = [2]
+        arg_dict["dilation_w"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10), (10, 32, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 0], [1, 1], [1, 1]]]
+        arg_dict["tf_padding"] = [[[0, 0], [1, 1], [1, 1], [0, 0]]]
+        arg_dict["stride_h"] = [2]
+        arg_dict["stride_w"] = [3]
+        arg_dict["data_format"] = ["NCHW"]
+        arg_dict["dilation_h"] = [2]
+        arg_dict["dilation_w"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 0], [1, 0], [1, 0]]]
+        arg_dict["tf_padding"] = [[[0, 0], [1, 0], [1, 0], [0, 0]]]
+        arg_dict["stride_h"] = [1]
+        arg_dict["stride_w"] = [2]
+        arg_dict["data_format"] = ["NCHW"]
+        arg_dict["dilation_h"] = [1]
+        arg_dict["dilation_w"] = [3]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list4(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10), (10, 32, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 0], [10, 2], [10, 2]]]
+        arg_dict["tf_padding"] = [[[0, 0], [10, 2], [10, 2], [0, 0]]]
+        arg_dict["stride_h"] = [2]
+        arg_dict["stride_w"] = [3]
+        arg_dict["data_format"] = ["NCHW"]
+        arg_dict["dilation_h"] = [2]
+        arg_dict["dilation_w"] = [4]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_padding_dynamic.py b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_padding_dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..f49211614943e4ee83892c93e1fb7de88f929e07
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nn_conv2d_padding_dynamic.py
@@ -0,0 +1,246 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+global_storage = {}
+
+
+def global_storage_setter(name):
+    global global_storage
+
+    def _set(x):
+        global_storage[name] = x
+
+    return _set
+
+
+def compare_with_tensorflow(
+    device_type,
+    x_shape,
+    filters,
+    kernel_size,
+    groups,
+    of_padding="SAME",
+    tf_padding="SAME",
+    stride=1,
+    data_format="NCHW",
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    if data_format == "NCHW":
+        xy_data_transpose = (0, 2, 3, 1)
+        weight_data_transpose = (2, 3, 1, 0)
+    else:
+        xy_data_transpose = (0, 1, 2, 3)
+        weight_data_transpose = (1, 2, 3, 0)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def DynamicConvJob(x: oft.ListNumpy.Placeholder((10, 3, 100, 100))):
+        with flow.scope.placement(device_type, "0:0"):
+            x_var = flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x_var = flow.cast_to_current_logical_view(x_var)
+            x += x_var
+            if data_format == "NCHW":
+                weight_shape = (filters, x_shape[1] // groups, kernel_size, kernel_size)
+            else:
+                weight_shape = (filters, kernel_size, kernel_size, x_shape[3] // groups)
+            weight = flow.get_variable(
+                "conv-weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            weight = flow.cast_to_current_logical_view(weight)
+            loss = flow.nn.conv2d(
+                x,
+                weight,
+                strides=[stride, stride],
+                padding=of_padding,
+                data_format=data_format,
+                dilations=[1, 1],
+                groups=groups,
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, global_storage_setter("x"))
+            flow.watch_diff(x, global_storage_setter("x_diff"))
+            flow.watch(weight, global_storage_setter("weight"))
+            flow.watch_diff(weight, global_storage_setter("weight_diff"))
+            flow.watch(loss, global_storage_setter("loss"))
+            flow.watch_diff(loss, global_storage_setter("loss_diff"))
+            return loss
+
+    data = [np.random.rand(*x_shape).astype(np.float32)]
+    of_out = DynamicConvJob(data).get().numpy_list()[0]
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(data[0].transpose(xy_data_transpose))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        weight = tf.Variable(
+            global_storage["weight"].numpy().transpose(weight_data_transpose)
+        )
+        tf_out = tf.nn.conv2d(
+            x,
+            weight,
+            strides=[1, stride, stride, 1],
+            padding=tf_padding,
+            data_format="NHWC",
+        )
+    idx = np.where(
+        np.abs(of_out.transpose(xy_data_transpose) - tf_out.numpy()) > 0.0005
+    )
+    assert np.allclose(
+        of_out.transpose(xy_data_transpose), tf_out.numpy(), rtol=0.001, atol=0.001
+    )
+    loss_diff = global_storage["loss_diff"].numpy_list()[0].transpose(xy_data_transpose)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    rtol = 0.0001
+    atol = 0.0001
+    if device_type == "cpu":
+        rtol *= 100
+        atol *= 100
+    assert np.allclose(
+        global_storage["x_diff"].numpy_list()[0].transpose(xy_data_transpose),
+        tf_x_diff.numpy(),
+        rtol=rtol,
+        atol=atol,
+    ), (
+        global_storage["x_diff"].numpy_list()[0].transpose(xy_data_transpose)
+        - tf_x_diff.numpy()
+    )
+    assert np.allclose(
+        global_storage["weight_diff"].numpy().transpose(weight_data_transpose),
+        tf_weight_diff.numpy(),
+        rtol=0.005,
+        atol=0.005,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+@unittest.skip("skip_for_ci")
+class TestNnConv2dPaddingDynamic(flow.unittest.TestCase):
+    def test_padding_valid(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 3, 10, 10), (10, 3, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["VALID"]
+        arg_dict["tf_padding"] = ["VALID"]
+        arg_dict["stride"] = [1, 2]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_padding_same(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 3, 10, 10), (10, 3, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["SAME_UPPER"]
+        arg_dict["tf_padding"] = ["SAME"]
+        arg_dict["stride"] = [1, 2]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 3, 10, 10), (10, 3, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 0], [0, 1], [1, 0]]]
+        arg_dict["tf_padding"] = [[[0, 0], [0, 1], [1, 0], [0, 0]]]
+        arg_dict["stride"] = [1, 2]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 3, 10, 10), (10, 3, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 0], [1, 1], [1, 1]]]
+        arg_dict["tf_padding"] = [[[0, 0], [1, 1], [1, 1], [0, 0]]]
+        arg_dict["stride"] = [1, 2]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 3, 10, 10), (10, 3, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 0], [1, 0], [1, 0]]]
+        arg_dict["tf_padding"] = [[[0, 0], [1, 0], [1, 0], [0, 0]]]
+        arg_dict["stride"] = [1, 2]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_pad_list4(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 3, 10, 10), (10, 3, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3, 2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = [[[0, 0], [0, 0], [10, 2], [10, 2]]]
+        arg_dict["tf_padding"] = [[[0, 0], [10, 2], [10, 2], [0, 0]]]
+        arg_dict["stride"] = [1, 2]
+        arg_dict["data_format"] = ["NCHW"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nn_conv3d.py b/python/oneflow/compatible/single_client/test/ops/test_nn_conv3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..85507f53b1a2dd388680319d45de4cf0f17faca4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nn_conv3d.py
@@ -0,0 +1,220 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type,
+    x_shape,
+    filters,
+    kernel_size,
+    groups,
+    of_padding="SAME",
+    tf_padding="SAME",
+    stride_d=1,
+    stride_h=1,
+    stride_w=1,
+    data_format="NCDHW",
+    dilation_d=1,
+    dilation_h=1,
+    dilation_w=1,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.cudnn_conv_heuristic_search_algo(False)
+    if data_format == "NCDHW":
+        xy_data_transpose = (0, 2, 3, 4, 1)
+        weight_data_transpose = (2, 3, 4, 1, 0)
+    else:
+        xy_data_transpose = (0, 1, 2, 3, 4)
+        weight_data_transpose = (1, 2, 3, 4, 0)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ConvJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            if data_format == "NCDHW":
+                weight_shape = (
+                    filters,
+                    x.shape[1] // groups,
+                    kernel_size,
+                    kernel_size,
+                    kernel_size,
+                )
+            else:
+                weight_shape = (
+                    filters,
+                    kernel_size,
+                    kernel_size,
+                    kernel_size,
+                    x.shape[4] // groups,
+                )
+            weight = flow.get_variable(
+                "conv-weight",
+                shape=weight_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            loss = flow.nn.conv3d(
+                x,
+                weight,
+                strides=[stride_d, stride_h, stride_w],
+                padding=of_padding,
+                data_format=data_format,
+                dilations=[dilation_d, dilation_h, dilation_w],
+                groups=groups,
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(weight, test_global_storage.Setter("weight"))
+            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ConvJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").transpose(xy_data_transpose))
+        assert groups > 0
+        assert x_shape[1] % groups == 0
+        assert filters % groups == 0
+        weight = tf.Variable(
+            test_global_storage.Get("weight").transpose(weight_data_transpose)
+        )
+        tf_out = tf.nn.conv3d(
+            x,
+            weight,
+            strides=[1, stride_d, stride_h, stride_w, 1],
+            padding=tf_padding,
+            data_format="NDHWC",
+            dilations=[1, dilation_d, dilation_h, dilation_w, 1],
+        )
+    loss_diff = test_global_storage.Get("loss_diff").transpose(xy_data_transpose)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)
+    assert np.allclose(
+        of_out.numpy().transpose(xy_data_transpose),
+        tf_out.numpy(),
+        rtol=1e-05,
+        atol=1e-05,
+    )
+    diff_idx = np.where(
+        np.abs(
+            test_global_storage.Get("x_diff").transpose(xy_data_transpose)
+            - tf_x_diff.numpy()
+        )
+        > 0.0005
+    )
+    assert np.allclose(
+        test_global_storage.Get("x_diff").transpose(xy_data_transpose),
+        tf_x_diff.numpy(),
+        rtol=0.0001,
+        atol=0.0001,
+    )
+    assert np.allclose(
+        test_global_storage.Get("weight_diff").transpose(weight_data_transpose),
+        tf_weight_diff.numpy(),
+        rtol=1e-05,
+        atol=1e-05,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNnConv3d(flow.unittest.TestCase):
+    def test_padding_valid_NDHWC(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 32, 10, 10, 10), (10, 32, 10, 10, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["VALID"]
+        arg_dict["tf_padding"] = ["VALID"]
+        arg_dict["stride_d"] = [1]
+        arg_dict["stride_h"] = [2]
+        arg_dict["stride_w"] = [3]
+        arg_dict["data_format"] = ["NDHWC"]
+        arg_dict["dilation"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_padding_valid_NCDHW(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10, 32, 11, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [3]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["SAME_UPPER"]
+        arg_dict["tf_padding"] = ["SAME"]
+        arg_dict["stride_d"] = [1]
+        arg_dict["stride_h"] = [2]
+        arg_dict["stride_w"] = [3]
+        arg_dict["data_format"] = ["NCDHW"]
+        arg_dict["dilation"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_padding_same(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 32, 11, 11, 11)]
+        arg_dict["filters"] = [64]
+        arg_dict["kernel_size"] = [2]
+        arg_dict["groups"] = [1]
+        arg_dict["of_padding"] = ["VALID"]
+        arg_dict["tf_padding"] = ["VALID"]
+        arg_dict["stride_d"] = [2]
+        arg_dict["stride_h"] = [2]
+        arg_dict["stride_w"] = [3]
+        arg_dict["data_format"] = ["NCDHW"]
+        arg_dict["dilation_d"] = [2]
+        arg_dict["dilation_h"] = [2]
+        arg_dict["dilation_w"] = [3]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_nvtx_range.py b/python/oneflow/compatible/single_client/test/ops/test_nvtx_range.py
new file mode 100644
index 0000000000000000000000000000000000000000..e68b883917e38a1d99ce55d330886d277fee2be8
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_nvtx_range.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestProfilerNvtxRange(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_profiler_nvtx_range(test_case):
+        @flow.global_function(type="train", function_config=func_config)
+        def nvtx_range_job(x: oft.Numpy.Placeholder((4, 4, 1024, 1024))):
+            x += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x = flow.math.relu(x)
+            x = flow.profiler.nvtx_start(x, mark_prefix="softmax")
+            x = flow.nn.softmax(x)
+            x = flow.nn.softmax(x)
+            x = flow.nn.softmax(x)
+            x = flow.nn.softmax(x)
+            x = flow.nn.softmax(x)
+            x = flow.profiler.nvtx_end(x, mark_prefix="softmax")
+            x = flow.math.relu(x)
+            x = flow.profiler.nvtx_start(x, mark_prefix="gelu")
+            x = flow.math.gelu(x)
+            x = flow.math.gelu(x)
+            x = flow.math.gelu(x)
+            x = flow.math.gelu(x)
+            x = flow.math.gelu(x)
+            x = flow.math.gelu(x)
+            x = flow.profiler.nvtx_end(x, mark_prefix="gelu")
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+            ).minimize(x)
+            return flow.identity(x)
+
+        input = np.random.rand(4, 4, 1024, 1024).astype(np.float32)
+        for i in range(3):
+            res = nvtx_range_job(input).get()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_object_bbox_flip.py b/python/oneflow/compatible/single_client/test/ops/test_object_bbox_flip.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce1937eb5aa21a3cf6303b76b3ecfdb40ab1ea6d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_object_bbox_flip.py
@@ -0,0 +1,117 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import random
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _of_object_bbox_flip(bbox_list, image_size, flip_code):
+    bbox_shape = _get_bbox_static_shape(bbox_list)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def object_bbox_flip_job(
+        bbox_def: oft.ListListNumpy.Placeholder(
+            shape=tuple(bbox_shape), dtype=flow.float
+        ),
+        image_size_def: oft.ListNumpy.Placeholder(
+            shape=image_size.shape, dtype=flow.int32
+        ),
+    ):
+        bbox_buffer = flow.tensor_list_to_tensor_buffer(bbox_def)
+        flip_bbox = flow.object_bbox_flip(bbox_buffer, image_size_def, flip_code)
+        return flow.tensor_buffer_to_tensor_list(
+            flip_bbox, shape=bbox_shape[1:], dtype=flow.float
+        )
+
+    input_bbox_list = [np.expand_dims(bbox, axis=0) for bbox in bbox_list]
+    bbox_tensor = object_bbox_flip_job([input_bbox_list], [image_size]).get()
+    return bbox_tensor.numpy_lists()[0]
+
+
+def _get_bbox_static_shape(bbox_list):
+    bbox_shapes = [bbox.shape for bbox in bbox_list]
+    bbox_static_shape = np.amax(bbox_shapes, axis=0)
+    assert isinstance(
+        bbox_static_shape, np.ndarray
+    ), "bbox_shapes: {}, bbox_static_shape: {}".format(
+        str(bbox_shapes), str(bbox_static_shape)
+    )
+    bbox_static_shape = bbox_static_shape.tolist()
+    bbox_static_shape.insert(0, len(bbox_list))
+    return bbox_static_shape
+
+
+def _compare_bbox_flip(
+    test_case, anno_file, batch_size, flip_code, print_debug_info=False
+):
+    from pycocotools.coco import COCO
+
+    coco = COCO(anno_file)
+    img_ids = coco.getImgIds()
+    bbox_list = []
+    image_size_list = []
+    sample_cnt = 0
+    while sample_cnt < batch_size:
+        rand_img_id = random.choice(img_ids)
+        anno_ids = coco.getAnnIds(imgIds=[rand_img_id])
+        if len(anno_ids) == 0:
+            continue
+        bbox_array = np.array(
+            [coco.anns[anno_id]["bbox"] for anno_id in anno_ids], dtype=np.single
+        )
+        bbox_list.append(bbox_array)
+        image_size_list.append(
+            [coco.imgs[rand_img_id]["width"], coco.imgs[rand_img_id]["height"]]
+        )
+        sample_cnt += 1
+    image_size_array = np.array(image_size_list, dtype=np.int32)
+    of_bbox_list = _of_object_bbox_flip(bbox_list, image_size_array, flip_code)
+    for (of_bbox, bbox, image_size) in zip(of_bbox_list, bbox_list, image_size_list):
+        (w, h) = image_size
+        if flip_code == 1:
+            xmin = bbox[:, 0].copy()
+            xmax = bbox[:, 2].copy()
+            bbox[:, 0] = w - xmax - 1
+            bbox[:, 2] = w - xmin - 1
+        else:
+            raise NotImplementedError
+        if print_debug_info:
+            print("-" * 20)
+            print("ret_bbox:\n", of_bbox.squeeze())
+            print("bbox:\n", bbox)
+        test_case.assertTrue(np.allclose(of_bbox.squeeze(), bbox))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestObjectBboxFlip(flow.unittest.TestCase):
+    def test_object_bbox_flip(test_case):
+        _compare_bbox_flip(
+            test_case, "/dataset/mscoco_2017/annotations/instances_val2017.json", 4, 1
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_object_bbox_scale.py b/python/oneflow/compatible/single_client/test/ops/test_object_bbox_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..50b93c2cdf55707f84d00083b6a49dcc23f21fc0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_object_bbox_scale.py
@@ -0,0 +1,174 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _random_sample_images(anno_file, image_dir, batch_size):
+    from pycocotools.coco import COCO
+
+    image_files = []
+    image_ids = []
+    batch_group_id = -1
+    coco = COCO(anno_file)
+    img_ids = coco.getImgIds()
+    while len(image_files) < batch_size:
+        rand_img_id = random.choice(img_ids)
+        img_h = coco.imgs[rand_img_id]["height"]
+        img_w = coco.imgs[rand_img_id]["width"]
+        group_id = int(img_h / img_w)
+        if batch_group_id == -1:
+            batch_group_id = group_id
+        if group_id != batch_group_id:
+            continue
+        anno_ids = coco.getAnnIds(imgIds=[rand_img_id])
+        if len(anno_ids) == 0:
+            continue
+        image_files.append(os.path.join(image_dir, coco.imgs[rand_img_id]["file_name"]))
+        image_ids.append(rand_img_id)
+    assert len(image_files) == len(image_ids)
+    images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+    bbox_list = _get_images_bbox_list(coco, image_ids)
+    return (images, bbox_list)
+
+
+def _get_images_bbox_list(coco, image_ids):
+    bbox_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        anno_ids = list(
+            filter(lambda anno_id: coco.anns[anno_id]["iscrowd"] == 0, anno_ids)
+        )
+        bbox_array = np.array(
+            [coco.anns[anno_id]["bbox"] for anno_id in anno_ids], dtype=np.single
+        )
+        bbox_list.append(bbox_array)
+    return bbox_list
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    image_static_shape.insert(0, len(image_shapes))
+    return image_static_shape
+
+
+def _get_bbox_static_shape(bbox_list):
+    bbox_shapes = [bbox.shape for bbox in bbox_list]
+    bbox_static_shape = np.amax(bbox_shapes, axis=0)
+    assert isinstance(
+        bbox_static_shape, np.ndarray
+    ), "bbox_shapes: {}, bbox_static_shape: {}".format(
+        str(bbox_shapes), str(bbox_static_shape)
+    )
+    bbox_static_shape = bbox_static_shape.tolist()
+    bbox_static_shape.insert(0, len(bbox_list))
+    return bbox_static_shape
+
+
+def _of_target_resize_bbox_scale(images, bbox_list, target_size, max_size):
+    image_shape = _get_images_static_shape(images)
+    bbox_shape = _get_bbox_static_shape(bbox_list)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def target_resize_bbox_scale_job(
+        image_def: oft.ListListNumpy.Placeholder(
+            shape=tuple(image_shape), dtype=flow.float
+        ),
+        bbox_def: oft.ListListNumpy.Placeholder(
+            shape=tuple(bbox_shape), dtype=flow.float
+        ),
+    ):
+        images_buffer = flow.tensor_list_to_tensor_buffer(image_def)
+        (resized_images_buffer, new_size, scale) = flow.image_target_resize(
+            images_buffer, target_size=target_size, max_size=max_size
+        )
+        bbox_buffer = flow.tensor_list_to_tensor_buffer(bbox_def)
+        scaled_bbox = flow.object_bbox_scale(bbox_buffer, scale)
+        scaled_bbox_list = flow.tensor_buffer_to_tensor_list(
+            scaled_bbox, shape=bbox_shape[1:], dtype=flow.float
+        )
+        return (scaled_bbox_list, new_size)
+
+    input_image_list = [np.expand_dims(image, axis=0) for image in images]
+    input_bbox_list = [np.expand_dims(bbox, axis=0) for bbox in bbox_list]
+    (output_bbox_list, output_image_size) = target_resize_bbox_scale_job(
+        [input_image_list], [input_bbox_list]
+    ).get()
+    return (output_bbox_list.numpy_lists()[0], output_image_size.numpy_list()[0])
+
+
+def _compare_bbox_scale(
+    test_case,
+    anno_file,
+    image_dir,
+    batch_size,
+    target_size,
+    max_size,
+    print_debug_info=False,
+):
+    (images, bbox_list) = _random_sample_images(anno_file, image_dir, batch_size)
+    (of_bbox_list, image_size_list) = _of_target_resize_bbox_scale(
+        images, bbox_list, target_size, max_size
+    )
+    for (image, bbox, of_bbox, image_size) in zip(
+        images, bbox_list, of_bbox_list, image_size_list
+    ):
+        (w, h) = image_size
+        (oh, ow) = image.shape[0:2]
+        scale_h = h / oh
+        scale_w = w / ow
+        bbox[:, 0] *= scale_w
+        bbox[:, 1] *= scale_h
+        bbox[:, 2] *= scale_w
+        bbox[:, 3] *= scale_h
+        test_case.assertTrue(np.allclose(bbox, of_bbox))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestObjectBboxScale(flow.unittest.TestCase):
+    def test_object_bbox_scale(test_case):
+        _compare_bbox_scale(
+            test_case,
+            "/dataset/mscoco_2017/annotations/instances_val2017.json",
+            "/dataset/mscoco_2017/val2017",
+            4,
+            800,
+            1333,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_object_segm_poly_flip.py b/python/oneflow/compatible/single_client/test/ops/test_object_segm_poly_flip.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4dc968cd5a1b09c95ae7feede6be0cd6c21f99b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_object_segm_poly_flip.py
@@ -0,0 +1,127 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import random
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _of_object_segm_poly_flip(poly_list, image_size, flip_code):
+    poly_shape = _get_segm_poly_static_shape(poly_list)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def object_segm_poly_flip_job(
+        poly_def: oft.ListListNumpy.Placeholder(
+            shape=tuple(poly_shape), dtype=flow.float
+        ),
+        image_size_def: oft.ListNumpy.Placeholder(
+            shape=image_size.shape, dtype=flow.int32
+        ),
+    ):
+        poly_buffer = flow.tensor_list_to_tensor_buffer(poly_def)
+        flip_poly = flow.object_segmentation_polygon_flip(
+            poly_buffer, image_size_def, flip_code
+        )
+        return flow.tensor_buffer_to_tensor_list(
+            flip_poly, shape=poly_shape[1:], dtype=flow.float
+        )
+
+    input_poly_list = [np.expand_dims(bbox, axis=0) for bbox in poly_list]
+    poly_tensor = object_segm_poly_flip_job([input_poly_list], [image_size]).get()
+    return poly_tensor.numpy_lists()[0]
+
+
+def _get_segm_poly_static_shape(poly_list):
+    poly_shapes = [poly.shape for poly in poly_list]
+    poly_static_shape = np.amax(poly_shapes, axis=0)
+    assert isinstance(
+        poly_static_shape, np.ndarray
+    ), "poly_shapes: {}, poly_static_shape: {}".format(
+        str(poly_shapes), str(poly_static_shape)
+    )
+    poly_static_shape = poly_static_shape.tolist()
+    poly_static_shape.insert(0, len(poly_list))
+    return poly_static_shape
+
+
+def _compare_segm_poly_flip(
+    test_case, anno_file, batch_size, flip_code, print_debug_info=False
+):
+    from pycocotools.coco import COCO
+
+    coco = COCO(anno_file)
+    img_ids = coco.getImgIds()
+    segm_poly_list = []
+    image_size_list = []
+    sample_cnt = 0
+    while sample_cnt < batch_size:
+        rand_img_id = random.choice(img_ids)
+        anno_ids = coco.getAnnIds(imgIds=[rand_img_id])
+        if len(anno_ids) == 0:
+            continue
+        poly_pts = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            assert isinstance(anno["segmentation"], list)
+            for poly in anno["segmentation"]:
+                assert isinstance(poly, list)
+                poly_pts.extend(poly)
+        poly_array = np.array(poly_pts, dtype=np.single).reshape(-1, 2)
+        segm_poly_list.append(poly_array)
+        image_size_list.append(
+            [coco.imgs[rand_img_id]["width"], coco.imgs[rand_img_id]["height"]]
+        )
+        sample_cnt += 1
+    image_size_array = np.array(image_size_list, dtype=np.int32)
+    of_segm_poly_list = _of_object_segm_poly_flip(
+        segm_poly_list, image_size_array, flip_code
+    )
+    for (of_poly, poly, image_size) in zip(
+        of_segm_poly_list, segm_poly_list, image_size_list
+    ):
+        (w, h) = image_size
+        if flip_code == 1:
+            poly[:, 0] = w - poly[:, 0]
+        else:
+            raise NotImplementedError
+        if print_debug_info:
+            print("-" * 20)
+            print("of_poly:", of_poly.squeeze().shape, "\n", of_poly.squeeze())
+            print("poly:", poly.shape, "\n", poly)
+    test_case.assertTrue(np.allclose(of_poly.squeeze(), poly))
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestObjectSegmPolyFlip(flow.unittest.TestCase):
+    def test_object_segm_poly_flip(test_case):
+        _compare_segm_poly_flip(
+            test_case, "/dataset/mscoco_2017/annotations/instances_val2017.json", 4, 1
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_object_segm_poly_to_mask.py b/python/oneflow/compatible/single_client/test/ops/test_object_segm_poly_to_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9eca0c854c7ad5a4cace8685eef30538c39aecb
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_object_segm_poly_to_mask.py
@@ -0,0 +1,424 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+coco_dict = dict()
+
+
+def _coco(anno_file):
+    global coco_dict
+    if anno_file not in coco_dict:
+        from pycocotools.coco import COCO
+
+        coco_dict[anno_file] = COCO(anno_file)
+    return coco_dict[anno_file]
+
+
+def _random_sample_image_ids(coco, batch_size):
+    img_ids = coco.getImgIds()
+    batch_img_ids = []
+    batch_group_id = -1
+    while len(batch_img_ids) < batch_size:
+        rand_img_id = random.choice(img_ids)
+        img_h = coco.imgs[rand_img_id]["height"]
+        img_w = coco.imgs[rand_img_id]["width"]
+        group_id = int(img_h / img_w)
+        if batch_group_id == -1:
+            batch_group_id = group_id
+        if group_id != batch_group_id:
+            continue
+        anno_ids = coco.getAnnIds(imgIds=[rand_img_id])
+        if len(anno_ids) == 0:
+            continue
+        batch_img_ids.append(rand_img_id)
+    return batch_img_ids
+
+
+def _read_images_with_cv(coco, image_dir, image_ids):
+    image_files = [
+        os.path.join(image_dir, coco.imgs[img_id]["file_name"]) for img_id in image_ids
+    ]
+    return [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+
+
+def _get_images_segm_poly(coco, image_ids):
+    img_segm_poly_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        assert len(anno_ids) > 0, "img {} has no anno".format(img_id)
+        segm_poly_list = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            segm = anno["segmentation"]
+            assert isinstance(segm, list)
+            assert len(segm) > 0, str(len(segm))
+            assert all([len(poly) > 0 for poly in segm]), str(
+                [len(poly) for poly in segm]
+            )
+            segm_poly_list.append(segm)
+        img_segm_poly_list.append(segm_poly_list)
+    return img_segm_poly_list
+
+
+def _get_check_image_size(coco, image_ids, images):
+    assert len(image_ids) == len(images)
+    image_size_list = []
+    for (i, img_id) in enumerate(image_ids):
+        img_h = coco.imgs[img_id]["height"]
+        img_w = coco.imgs[img_id]["width"]
+        assert img_h == images[i].shape[0]
+        assert img_w == images[i].shape[1]
+        image_size_list.append([img_w, img_h])
+    return image_size_list
+
+
+def _segm_poly_to_tensor(img_segm_poly_list):
+    poly_array_list = []
+    poly_index_array_list = []
+    for (img_idx, segm_poly_list) in enumerate(img_segm_poly_list):
+        img_poly_elem_list = []
+        img_poly_index_list = []
+        for (obj_idx, poly_list) in enumerate(segm_poly_list):
+            for (poly_idx, poly) in enumerate(poly_list):
+                img_poly_elem_list.extend(poly)
+                for (pt_idx, pt) in enumerate(poly):
+                    if pt_idx % 2 == 0:
+                        img_poly_index_list.append([pt_idx / 2, poly_idx, obj_idx])
+        img_poly_array = np.array(img_poly_elem_list, dtype=np.single).reshape(-1, 2)
+        assert img_poly_array.size > 0, segm_poly_list
+        poly_array_list.append(img_poly_array)
+        img_poly_index_array = np.array(img_poly_index_list, dtype=np.int32)
+        assert img_poly_index_array.size > 0, segm_poly_list
+        poly_index_array_list.append(img_poly_index_array)
+    return (poly_array_list, poly_index_array_list)
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    image_static_shape.insert(0, len(image_shapes))
+    return image_static_shape
+
+
+def _get_segm_poly_static_shape(poly_list, poly_index_list):
+    assert len(poly_list) == len(poly_index_list)
+    num_images = len(poly_list)
+    max_poly_elems = 0
+    for (poly, poly_index) in zip(poly_list, poly_index_list):
+        assert len(poly.shape) == 2
+        assert len(poly_index.shape) == 2, str(poly_index.shape)
+        assert poly.shape[0] == poly_index.shape[0]
+        assert poly.shape[1] == 2
+        assert poly_index.shape[1] == 3
+        max_poly_elems = max(max_poly_elems, poly.shape[0])
+    return ([num_images, max_poly_elems, 2], [num_images, max_poly_elems, 3])
+
+
+def _of_poly_to_mask_pipline(
+    images, poly_list, poly_index_list, num_segms_list, target_size, max_size
+):
+    assert len(images) == len(poly_list)
+    assert len(poly_list) == len(poly_index_list)
+    image_shape = _get_images_static_shape(images)
+    (poly_shape, poly_index_shape) = _get_segm_poly_static_shape(
+        poly_list, poly_index_list
+    )
+    max_num_segms = max(num_segms_list)
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def poly_to_mask_job(
+        image_def: oft.ListListNumpy.Placeholder(
+            shape=tuple(image_shape), dtype=flow.float
+        ),
+        poly_def: oft.ListListNumpy.Placeholder(
+            shape=tuple(poly_shape), dtype=flow.float
+        ),
+        poly_index_def: oft.ListListNumpy.Placeholder(
+            shape=tuple(poly_index_shape), dtype=flow.int32
+        ),
+    ):
+        images_buffer = flow.tensor_list_to_tensor_buffer(image_def)
+        (resized_images_buffer, new_size, scale) = flow.image_target_resize(
+            images_buffer, target_size=target_size, max_size=max_size
+        )
+        poly_buffer = flow.tensor_list_to_tensor_buffer(poly_def)
+        poly_index_buffer = flow.tensor_list_to_tensor_buffer(poly_index_def)
+        scaled_poly_buffer = flow.object_segmentation_polygon_scale(poly_buffer, scale)
+        mask_buffer = flow.object_segmentation_polygon_to_mask(
+            scaled_poly_buffer, poly_index_buffer, new_size
+        )
+        mask_list = flow.tensor_buffer_to_tensor_list(
+            mask_buffer, shape=(max_num_segms, target_size, max_size), dtype=flow.int8
+        )
+        scaled_poly_list = flow.tensor_buffer_to_tensor_list(
+            scaled_poly_buffer, shape=poly_shape[1:], dtype=flow.float
+        )
+        return (mask_list, scaled_poly_list)
+
+    input_image_list = [np.expand_dims(image, axis=0) for image in images]
+    input_poly_list = [np.expand_dims(poly, axis=0) for poly in poly_list]
+    input_poly_index_list = [
+        np.expand_dims(poly_index, axis=0) for poly_index in poly_index_list
+    ]
+    (output_mask_list, output_poly_list) = poly_to_mask_job(
+        [input_image_list], [input_poly_list], [input_poly_index_list]
+    ).get()
+    return (output_mask_list.numpy_lists()[0], output_poly_list.numpy_lists()[0])
+
+
+def _get_target_resize_scale(size, target_size, max_size):
+    (w, h) = size
+    min_ori_size = float(min((w, h)))
+    max_ori_size = float(max((w, h)))
+    min_res_size = target_size
+    max_res_size = int(round(max_ori_size / min_ori_size * min_res_size))
+    if max_res_size > max_size:
+        max_res_size = max_size
+        min_res_size = int(round(max_res_size * min_ori_size / max_ori_size))
+    if w < h:
+        res_w = min_res_size
+        res_h = max_res_size
+    else:
+        res_w = max_res_size
+        res_h = min_res_size
+    return ([res_w, res_h], [res_w / w, res_h / h])
+
+
+def _scale_poly_list(img_segm_poly_list, scale_list):
+    assert len(img_segm_poly_list) == len(scale_list)
+    for (img_idx, segm_poly_list) in enumerate(img_segm_poly_list):
+        (scale_w, scale_h) = scale_list[img_idx]
+        for poly_list in segm_poly_list:
+            for poly in poly_list:
+                for pt_idx in range(len(poly)):
+                    if pt_idx % 2 == 0:
+                        poly[pt_idx] *= scale_w
+                    else:
+                        poly[pt_idx] *= scale_h
+    return img_segm_poly_list
+
+
+def _poly_to_mask_with_cv(img_segm_poly_list, image_size_list):
+    assert len(img_segm_poly_list) == len(image_size_list)
+    img_segm_mask_list = []
+    for (segm_poly_list, size) in zip(img_segm_poly_list, image_size_list):
+        segm_mask_list = []
+        for poly_list in segm_poly_list:
+            segm_mask = np.zeros(shape=size[::-1], dtype=np.int8)
+            poly_array_list = [
+                np.int32(np.round(np.asarray(poly)).reshape(-1, 2))
+                for poly in poly_list
+            ]
+            cv2.fillPoly(segm_mask, poly_array_list, 1, lineType=8)
+            segm_mask_list.append(segm_mask)
+        segm_mask_array = np.asarray(segm_mask_list)
+        img_segm_mask_list.append(segm_mask_array)
+    return img_segm_mask_list
+
+
+def _poly_to_mask_with_of_and_cv(
+    test_case,
+    anno_file,
+    image_dir,
+    batch_size,
+    target_size,
+    max_size,
+    img_ids=None,
+    print_debug_info=False,
+):
+    coco = _coco(anno_file)
+    if img_ids is not None:
+        assert len(img_ids) == batch_size
+        rand_img_ids = img_ids
+    else:
+        rand_img_ids = _random_sample_image_ids(coco, batch_size)
+    images = _read_images_with_cv(coco, image_dir, rand_img_ids)
+    image_size_list = _get_check_image_size(coco, rand_img_ids, images)
+    img_segm_poly_list = _get_images_segm_poly(coco, rand_img_ids)
+    assert len(img_segm_poly_list) == len(image_size_list)
+    (poly_list, poly_index_list) = _segm_poly_to_tensor(img_segm_poly_list)
+    num_segms_list = [len(segm_poly_list) for segm_poly_list in img_segm_poly_list]
+    if print_debug_info:
+        print("poly_shapes:", [poly.shape for poly in poly_list])
+        print("poly_index_shapes", [poly_index.shape for poly_index in poly_index_list])
+        print("image_size_list:", image_size_list)
+        print("num_segms_list:", num_segms_list)
+    (of_mask_list, of_scaled_poly_list) = _of_poly_to_mask_pipline(
+        images, poly_list, poly_index_list, num_segms_list, target_size, max_size
+    )
+    of_mask_list = [
+        mask_array.reshape(-1, mask_array.shape[-2], mask_array.shape[-1])
+        for mask_array in of_mask_list
+    ]
+    if print_debug_info:
+        print("of_mask_list shapes:", [of_mask.shape for of_mask in of_mask_list])
+    new_image_size_list = []
+    scale_list = []
+    for image_size in image_size_list:
+        (new_size, scale) = _get_target_resize_scale(image_size, target_size, max_size)
+        new_image_size_list.append(new_size)
+        scale_list.append(scale)
+    if print_debug_info:
+        print("resized size: {}, scale: {}".format(new_image_size_list, scale_list))
+    scaled_img_segm_poly_list = _scale_poly_list(img_segm_poly_list, scale_list)
+    (scaled_poly_list, scaled_poly_index_list) = _segm_poly_to_tensor(
+        scaled_img_segm_poly_list
+    )
+    img_segm_mask_list = _poly_to_mask_with_cv(
+        scaled_img_segm_poly_list, new_image_size_list
+    )
+    assert len(img_segm_mask_list) == len(of_mask_list)
+    if test_case is not None:
+        for (of_scaled_poly, scaled_poly, poly_index, scaled_poly_index) in zip(
+            of_scaled_poly_list,
+            scaled_poly_list,
+            poly_index_list,
+            scaled_poly_index_list,
+        ):
+            if print_debug_info:
+                print(
+                    "compare scaled poly: shape {} vs {}\n\tmax_abs_diff: {}".format(
+                        of_scaled_poly.shape,
+                        scaled_poly.shape,
+                        np.max(np.absolute(of_scaled_poly - scaled_poly)),
+                    )
+                )
+            test_case.assertTrue(np.allclose(of_scaled_poly, scaled_poly))
+            test_case.assertTrue(np.array_equal(poly_index, scaled_poly_index))
+        for (of_mask, gt_mask) in zip(of_mask_list, img_segm_mask_list):
+            if print_debug_info:
+                print(
+                    "compare segm mask: shape {} vs {}".format(
+                        of_mask.shape, gt_mask.shape
+                    )
+                )
+            test_case.assertTrue(np.array_equal(of_mask.shape, gt_mask.shape))
+    return (of_mask_list, img_segm_mask_list)
+
+
+def _vis_img_segm_mask_cmp(mask_list, cmp_mask_list):
+    assert len(mask_list) == len(cmp_mask_list)
+    import ipywidgets as ipw
+    import matplotlib.pyplot as plt
+    from IPython.display import clear_output, display
+
+    plt.close("all")
+    plt.ioff()
+    (fig, (ax1, ax2)) = plt.subplots(1, 2)
+    fig.set_dpi(150)
+    out_widget = ipw.Output()
+    next_btn = ipw.Button(description="Next")
+    vbox = ipw.VBox(children=(out_widget, next_btn))
+    display(vbox)
+    cur_img_idx = 0
+    cur_mask_idx = 0
+
+    def display_fig():
+        nonlocal cur_img_idx, cur_mask_idx
+        mask_array = mask_list[cur_img_idx]
+        cmp_mask_array = cmp_mask_list[cur_img_idx]
+        assert mask_array.shape == cmp_mask_array.shape, "{} vs {}".format(
+            str(mask_array.shape), str(cmp_mask_array.shape)
+        )
+        mask = mask_array[cur_mask_idx]
+        cmp_mask = cmp_mask_array[cur_mask_idx]
+        ax1.clear()
+        ax2.clear()
+        fig.suptitle(
+            "img_idx:{}, mask_idx:{}".format(cur_img_idx, cur_mask_idx), fontsize=10
+        )
+        ax1.imshow(mask)
+        ax2.imshow(cmp_mask)
+        nonlocal out_widget
+        with out_widget:
+            clear_output(wait=True)
+            display(fig)
+
+    def on_next_clicked(b):
+        nonlocal cur_img_idx, cur_mask_idx
+        eof = False
+        cur_mask_array_len = len(mask_list[cur_img_idx])
+        if cur_mask_idx < cur_mask_array_len - 1:
+            cur_mask_idx += 1
+        else:
+            cur_mask_list_len = len(mask_list)
+            if cur_img_idx < cur_mask_list_len - 1:
+                cur_img_idx += 1
+                cur_mask_idx = 0
+            else:
+                eof = True
+        if eof:
+            nonlocal next_btn
+            next_btn.close()
+            del next_btn
+        else:
+            display_fig()
+
+    next_btn.on_click(on_next_clicked)
+    display_fig()
+
+
+def _check_empty_anno_img_ids(anno_file):
+    coco = _coco(anno_file)
+    img_ids = coco.getImgIds()
+    empty_anno_img_ids = []
+    for img_id in img_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        if len(anno_ids) == 0:
+            empty_anno_img_ids.append(img_id)
+    print("empty_anno_img_ids:", empty_anno_img_ids)
+
+
+'\nif __name__ == "__main__":\n    # _check_empty_anno_img_ids("/dataset/mscoco_2017/annotations/instances_val2017.json")\n    of_mask_list, mask_list = _poly_to_mask_with_of_and_cv(\n        None,\n        "/dataset/mscoco_2017/annotations/instances_val2017.json",\n        "/dataset/mscoco_2017/val2017",\n        4,\n        800,\n        1333,\n        # img_ids=[226111, 58636, 458790, 461275],\n        print_debug_info=True,\n    )\n    _vis_img_segm_mask_cmp(of_mask_list, mask_list)\n'
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestObjectSegmPolyToMask(flow.unittest.TestCase):
+    def test_poly_to_mask(test_case):
+        _poly_to_mask_with_of_and_cv(
+            test_case,
+            "/dataset/mscoco_2017/annotations/instances_val2017.json",
+            "/dataset/mscoco_2017/val2017",
+            4,
+            800,
+            1333,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_one_hot.py b/python/oneflow/compatible/single_client/test/ops/test_one_hot.py
new file mode 100644
index 0000000000000000000000000000000000000000..66750d8cf835772c2eeb9b4c1a5b80fe0869452f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_one_hot.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _check(test_case, x, y, depth, on_value, off_value, axis):
+    out = tf.one_hot(x, depth=depth, axis=axis, on_value=on_value, off_value=off_value)
+    test_case.assertTrue(np.array_equal(out.numpy(), y))
+
+
+def _run_test(
+    test_case, device_type, x_shape, depth, dtype, out_dtype, on_value, off_value, axis
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def one_hot_job(
+        x: oft.Numpy.Placeholder(x_shape, dtype=type_name_to_flow_type[dtype])
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.one_hot(
+                x,
+                depth=depth,
+                on_value=on_value,
+                off_value=off_value,
+                axis=axis,
+                dtype=type_name_to_flow_type[out_dtype],
+            )
+
+    x = np.random.randint(0, depth, x_shape).astype(type_name_to_np_type[dtype])
+    y = one_hot_job(x).get()
+    _check(test_case, x, y.numpy(), depth, on_value, off_value, axis)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestOneHot(flow.unittest.TestCase):
+    def test_one_hot(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 20, 20)]
+        arg_dict["depth"] = [10]
+        arg_dict["dtype"] = ["int32", "int64"]
+        arg_dict["out_dtype"] = ["int32", "double"]
+        arg_dict["on_value"] = [5]
+        arg_dict["off_value"] = [2]
+        arg_dict["axis"] = [-1, 0, 2]
+        for arg in GenArgList(arg_dict):
+            _run_test(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_ones.py b/python/oneflow/compatible/single_client/test/ops/test_ones.py
new file mode 100644
index 0000000000000000000000000000000000000000..d71f6767412ee14d1e0d2d75c1505213ceab8472
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_ones.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_ones_with_np(input_shape, device_type, machine_ids, device_counts):
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    np_out_ones = np.ones(shape=input_shape, dtype=np.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_ones() -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=np_out_ones.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+        of_ones = flow.ones(shape=input_shape, dtype=flow.float32)
+        of_out = of_ones + v
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_out)
+        return of_ones
+
+    of_out_ones = oneflow_ones()
+    assert np.allclose(of_out_ones, np_out_ones)
+
+
+def _gen_arg_dict(shape, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testones1n1d(flow.unittest.TestCase):
+    def test_ones_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3), device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_ones_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_ones_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32), device_type="gpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_ones_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testones1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_ones_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 8, 8, 4), device_type="gpu", machine_ids="0:0-1", device_counts=2
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_ones_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_optimizer_placement_optimization.py b/python/oneflow/compatible/single_client/test/ops/test_optimizer_placement_optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..c612e5dea179411bb831ea77ac48c56328d50e88
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_optimizer_placement_optimization.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test(test_case, mode):
+    flow.config.gpu_device_num(2)
+    flow.config.enable_debug_mode(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.optimizer_placement_optimization_mode(mode)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def Foo(x: oft.Numpy.Placeholder((2, 1024 * 1024))):
+        w = flow.get_variable(
+            "w", (1024 * 1024,), initializer=flow.constant_initializer(100)
+        )
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [5]), momentum=0
+        ).minimize(x + w)
+
+    Foo(np.ones((2, 1024 * 1024), dtype=np.float32))
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestOptimizerPlacementOptimization(flow.unittest.TestCase):
+    def test_non_distributed(test_case):
+        _test(test_case, "non_distributed")
+
+    def test_distributed_split(test_case):
+        _test(test_case, "distributed_split")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_optimizers.py b/python/oneflow/compatible/single_client/test/ops/test_optimizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7cd3ceafdc3746aded2121e6a505cdc1ca5702
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_optimizers.py
@@ -0,0 +1,1055 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow_rmsprop(
+    device_type, x_shape, centered, decay_rate, learning_rate, train_iters
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testRmsprop(
+        random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name="x",
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x * random_mask)
+            flow.optimizer.RMSProp(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                decay_rate=decay_rate,
+                epsilon=0,
+                centered=centered,
+            ).minimize(loss)
+            return x
+
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testRmsprop(random_masks_seq[i])
+        if i == 0:
+            init_value = np.copy(x)
+    var = tf.Variable(init_value)
+    opt = tf.keras.optimizers.RMSprop(
+        learning_rate=learning_rate,
+        rho=decay_rate,
+        momentum=0.0,
+        epsilon=0,
+        centered=centered,
+    )
+    for i in range(train_iters):
+        with tf.GradientTape() as tape:
+            random_mask = tf.Variable(random_masks_seq[i])
+            loss = tf.reduce_mean(var * random_mask)
+        gradients = tape.gradient(loss, var)
+        opt.apply_gradients(zip([gradients], [var]))
+    assert np.allclose(x.flatten(), var.numpy().flatten(), rtol=0.1, atol=0.1), (
+        x.flatten() - var.numpy().flatten()
+    )
+
+
+def compare_with_tensorflow_adam(
+    device_type, x_shape, beta1, beta2, epsilon, learning_rate, train_iters
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testAdam(
+        random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name="x",
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x * random_mask)
+            flow.optimizer.Adam(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                do_bias_correction=True,
+            ).minimize(loss)
+            return x
+
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testAdam(random_masks_seq[i])
+        if i == 0:
+            init_value = np.copy(x)
+    var = tf.Variable(init_value)
+    opt = tf.keras.optimizers.Adam(
+        learning_rate=learning_rate,
+        beta_1=beta1,
+        beta_2=beta2,
+        epsilon=epsilon,
+        amsgrad=False,
+    )
+    for i in range(train_iters):
+        with tf.GradientTape() as tape:
+            random_mask = tf.Variable(random_masks_seq[i])
+            loss = tf.reduce_mean(var * random_mask)
+        gradients = tape.gradient(loss, var)
+        opt.apply_gradients(zip([gradients], [var]))
+    assert np.allclose(x.flatten(), var.numpy().flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_numpy_adamw(
+    device_type,
+    x_shape,
+    beta1,
+    beta2,
+    epsilon,
+    weight_decay,
+    learning_rate,
+    train_iters,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testAdamW(
+        random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name="x",
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x * random_mask)
+            flow.optimizer.AdamW(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                weight_decay=weight_decay,
+                do_bias_correction=True,
+            ).minimize(loss)
+            return x
+
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testAdamW(random_masks_seq[i])
+        if i == 0:
+            init_value = np.copy(x)
+
+    def adamw_update_numpy(
+        param,
+        gradient,
+        iter,
+        m,
+        v,
+        lr=0.001,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-07,
+        weight_decay=0.9,
+    ):
+        lr_t = lr * np.sqrt(1 - beta2 ** (iter + 1)) / (1 - beta1 ** (iter + 1))
+        m_t = beta1 * m + (1 - beta1) * gradient
+        v_t = beta2 * v + (1 - beta2) * gradient * gradient
+        param_t = param - lr_t * (m_t / (np.sqrt(v_t) + epsilon) + weight_decay * param)
+        return (param_t, m_t, v_t)
+
+    param = init_value
+    gradient = np.full(param.shape, 1.0 / np.prod(param.shape))
+    m = np.zeros(param.shape)
+    v = np.zeros(param.shape)
+    for i in range(train_iters):
+        (param, m, v) = adamw_update_numpy(
+            param,
+            gradient * random_masks_seq[i],
+            i,
+            m,
+            v,
+            learning_rate,
+            beta1,
+            beta2,
+            epsilon,
+            weight_decay,
+        )
+    assert np.allclose(x.flatten(), param.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_numpy_lazy_adam(
+    device_type, x_shape, beta1, beta2, epsilon, learning_rate, train_iters
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testLazyAdam() -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name="x",
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x)
+            flow.optimizer.LazyAdam(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+            ).minimize(loss)
+            return x
+
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testLazyAdam()
+        if i == 0:
+            init_value = np.copy(x)
+
+    def lazy_adam_update_numpy(
+        param, gradient, iter, m, v, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-07
+    ):
+        lr_t = lr * np.sqrt(1 - beta2 ** (iter + 1)) / (1 - beta1 ** (iter + 1))
+        m_t = np.copy(m)
+        v_t = np.copy(v)
+        m_t_o = beta1 * m + (1 - beta1) * gradient
+        v_t_o = beta2 * v + (1 - beta2) * gradient * gradient
+        m_t = m_t_o
+        v_t = v_t_o
+        param_t = np.copy(param)
+        param_t_o = param - lr_t * m_t / (np.sqrt(v_t) + epsilon)
+        param_t = param_t_o
+        return (param_t, m_t, v_t)
+
+    param = init_value
+    gradient = np.full(param.shape, 1.0 / np.prod(param.shape))
+    m = np.zeros(param.shape)
+    v = np.zeros(param.shape)
+    for i in range(train_iters):
+        (param, m, v) = lazy_adam_update_numpy(
+            param, gradient, i, m, v, learning_rate, beta1, beta2, epsilon
+        )
+    assert np.allclose(x.flatten(), param.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_numpy_lars(
+    device_type,
+    x_shape,
+    momentum_beta,
+    epsilon,
+    lars_coefficient,
+    learning_rate,
+    weight_decay,
+    train_iters,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testLars(
+        random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name="x",
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x * random_mask)
+            flow.optimizer.LARS(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                momentum_beta=momentum_beta,
+                epsilon=epsilon,
+                lars_coefficient=lars_coefficient,
+                weight_decay=weight_decay,
+            ).minimize(loss)
+            return x
+
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testLars(random_masks_seq[i])
+        if i == 0:
+            init_value = np.copy(x)
+
+    def lars_update_numpy(
+        param,
+        gradient,
+        momentum,
+        learning_rate,
+        momentum_beta,
+        weight_decay,
+        epsilon,
+        lars_coefficient,
+    ):
+        import math
+
+        model_norm = math.sqrt(np.sum(param * param))
+        model_diff_norm = math.sqrt(np.sum(gradient * gradient))
+        if model_norm > 0 and model_diff_norm > 0:
+            lars = (
+                lars_coefficient
+                * model_norm
+                / (model_diff_norm + weight_decay * model_norm + epsilon)
+            )
+        else:
+            lars = 1.0
+        local_learning_rate = learning_rate * lars
+        momentum_t = momentum_beta * momentum - local_learning_rate * gradient
+        param_t = param + momentum_t - local_learning_rate * weight_decay * param
+        return (param_t, momentum_t)
+
+    param = init_value
+    gradient = np.full(param.shape, 1.0 / np.prod(param.shape))
+    momentum = np.zeros(param.shape)
+    for i in range(train_iters):
+        (param, momentum) = lars_update_numpy(
+            param,
+            gradient * random_masks_seq[i],
+            momentum,
+            learning_rate,
+            momentum_beta,
+            weight_decay,
+            epsilon,
+            lars_coefficient,
+        )
+    assert np.allclose(x.flatten(), param.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_tensorflow_sgd(
+    device_type, x_shape, momentum, learning_rate, train_iters
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testSGD(
+        random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name="x",
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x * random_mask)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                momentum=momentum,
+            ).minimize(loss)
+            return x
+
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testSGD(random_masks_seq[i])
+        if i == 0:
+            init_value = np.copy(x)
+    var = tf.Variable(init_value)
+    opt = tf.keras.optimizers.SGD(
+        learning_rate=learning_rate, momentum=momentum, nesterov=False
+    )
+    for i in range(train_iters):
+        with tf.GradientTape() as tape:
+            random_mask = tf.Variable(random_masks_seq[i])
+            loss = tf.reduce_mean(var * random_mask)
+        gradients = tape.gradient(loss, var)
+        opt.apply_gradients(zip([gradients], [var]))
+    assert np.allclose(x.flatten(), var.numpy().flatten(), rtol=0.0001, atol=0.0001)
+
+
+def unique_grads(sparse_ids, sparse_grads):
+    num_ids = np.prod(sparse_ids.shape)
+    sparse_grads_shape = (num_ids,) + sparse_grads.shape[len(sparse_ids.shape) :]
+    sparse_grads = sparse_grads.reshape(sparse_grads_shape)
+    sparse_ids = sparse_ids.flatten()
+    unique_dict = {}
+    for i in range(num_ids):
+        if sparse_ids[i] in unique_dict:
+            unique_dict[sparse_ids[i]] += sparse_grads[i].copy()
+        else:
+            unique_dict[sparse_ids[i]] = sparse_grads[i].copy()
+    return unique_dict
+
+
+def compare_with_numpy_indexed_slices_sgd(
+    device_type,
+    model_shape,
+    ids_shape,
+    grad_shape,
+    momentum_beta,
+    learning_rate,
+    train_iters,
+    mul_scalar,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.indexed_slices_optimizer_conf(
+        dict(include_op_names=dict(op_name=["embeddings"]))
+    )
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testIndexedSlicesSGD(
+        sparse_ids: flow.typing.Numpy.Placeholder(ids_shape, dtype=flow.int32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            embedding_table = flow.get_variable(
+                name="embeddings",
+                shape=model_shape,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            embedding = flow.gather(
+                params=embedding_table * mul_scalar, indices=sparse_ids
+            )
+            loss = flow.math.reduce_mean(embedding)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                momentum=momentum_beta,
+            ).minimize(loss)
+            return embedding_table
+
+    sparse_ids = np.random.randint(model_shape[0], size=ids_shape).astype(np.int32)
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testIndexedSlicesSGD(sparse_ids)
+        if i == 0:
+            init_value = np.copy(x)
+
+    def indexed_slices_update_numpy(
+        param, unique_dict, iter, momentum, lr=0.001, momentum_beta=0
+    ):
+        param_t = np.copy(param)
+        momentum_t = np.copy(momentum)
+        for ids in unique_dict.keys():
+            next_momentum = momentum_beta * momentum_t[ids] - lr * unique_dict[ids]
+            momentum_t[ids] = next_momentum
+            param_t_o = param[ids] + next_momentum
+            param_t[ids] = param_t_o
+        return (param_t, momentum_t)
+
+    param = init_value
+    gradient = np.full(grad_shape, float(mul_scalar) / np.prod(grad_shape))
+    momentum = np.zeros(param.shape)
+    unique_dict = unique_grads(sparse_ids, gradient)
+    for i in range(train_iters):
+        (param, momentum) = indexed_slices_update_numpy(
+            param, unique_dict, i, momentum, learning_rate, momentum_beta
+        )
+    assert np.allclose(x.flatten(), param.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_numpy_indexed_slices_sgdw(
+    device_type,
+    model_shape,
+    ids_shape,
+    grad_shape,
+    momentum_beta,
+    learning_rate,
+    train_iters,
+    mul_scalar,
+    weight_decay,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.indexed_slices_optimizer_conf(
+        dict(include_op_names=dict(op_name=["embeddings"]))
+    )
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testIndexedSlicesSGDW(
+        sparse_ids: flow.typing.Numpy.Placeholder(ids_shape, dtype=flow.int32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            embedding_table = flow.get_variable(
+                name="embeddings",
+                shape=model_shape,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            embedding = flow.gather(
+                params=embedding_table * mul_scalar, indices=sparse_ids
+            )
+            loss = flow.math.reduce_mean(embedding)
+            flow.optimizer.SGDW(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                momentum=momentum_beta,
+                weight_decay=weight_decay,
+            ).minimize(loss)
+            return embedding_table
+
+    sparse_ids = np.random.randint(model_shape[0], size=ids_shape).astype(np.int32)
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testIndexedSlicesSGDW(sparse_ids)
+        if i == 0:
+            init_value = np.copy(x)
+
+    def indexed_slices_update_numpy(
+        param, unique_dict, iter, momentum, lr=0.001, momentum_beta=0, weight_decay=0.9
+    ):
+        param_t = np.copy(param)
+        momentum_t = np.copy(momentum)
+        for ids in unique_dict.keys():
+            next_momentum = momentum_beta * momentum_t[ids] - lr * unique_dict[ids]
+            momentum_t[ids] = next_momentum
+            param_t_o = param[ids] + next_momentum - lr * weight_decay * param[ids]
+            param_t[ids] = param_t_o
+        return (param_t, momentum_t)
+
+    param = init_value
+    gradient = np.full(grad_shape, float(mul_scalar) / np.prod(grad_shape))
+    momentum = np.zeros(param.shape)
+    unique_dict = unique_grads(sparse_ids, gradient)
+    for i in range(train_iters):
+        (param, momentum) = indexed_slices_update_numpy(
+            param, unique_dict, i, momentum, learning_rate, momentum_beta, weight_decay
+        )
+    assert np.allclose(x.flatten(), param.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_numpy_indexed_slices_adam(
+    device_type,
+    model_shape,
+    ids_shape,
+    grad_shape,
+    beta1,
+    beta2,
+    epsilon,
+    learning_rate,
+    train_iters,
+    mul_scalar,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.indexed_slices_optimizer_conf(
+        dict(include_op_names=dict(op_name=["embeddings"]))
+    )
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testIndexedSlicesAdam(
+        sparse_ids: flow.typing.Numpy.Placeholder(ids_shape, dtype=flow.int32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            embedding_table = flow.get_variable(
+                name="embeddings",
+                shape=model_shape,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            embedding = flow.gather(
+                params=embedding_table * mul_scalar, indices=sparse_ids
+            )
+            loss = flow.math.reduce_mean(embedding)
+            flow.optimizer.Adam(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                do_bias_correction=True,
+            ).minimize(loss)
+            return embedding_table
+
+    sparse_ids = np.random.randint(model_shape[0], size=ids_shape).astype(np.int32)
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testIndexedSlicesAdam(sparse_ids)
+        if i == 0:
+            init_value = np.copy(x)
+
+    def indexed_slices_update_numpy(
+        param, unique_dict, iter, m, v, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-07
+    ):
+        param_t = np.copy(param)
+        m_t = np.copy(m)
+        v_t = np.copy(v)
+        for ids in unique_dict.keys():
+            lr_t = lr * np.sqrt(1 - beta2 ** (iter + 1)) / (1 - beta1 ** (iter + 1))
+            m_t_o = beta1 * m[ids] + (1 - beta1) * unique_dict[ids]
+            v_t_o = beta2 * v[ids] + (1 - beta2) * unique_dict[ids] * unique_dict[ids]
+            m_t[ids] = m_t_o
+            v_t[ids] = v_t_o
+            param_t_o = param[ids] - lr_t * m_t[ids] / (np.sqrt(v_t[ids]) + epsilon)
+            param_t[ids] = param_t_o
+        return (param_t, m_t, v_t)
+
+    param = init_value
+    gradient = np.full(grad_shape, float(mul_scalar) / np.prod(grad_shape))
+    m = np.zeros(param.shape)
+    v = np.zeros(param.shape)
+    unique_dict = unique_grads(sparse_ids, gradient)
+    for i in range(train_iters):
+        (param, m, v) = indexed_slices_update_numpy(
+            param, unique_dict, i, m, v, learning_rate, beta1, beta2, epsilon
+        )
+    assert np.allclose(x.flatten(), param.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_numpy_indexed_slices_adamw(
+    device_type,
+    model_shape,
+    ids_shape,
+    grad_shape,
+    beta1,
+    beta2,
+    epsilon,
+    learning_rate,
+    train_iters,
+    mul_scalar,
+    weight_decay,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.indexed_slices_optimizer_conf(
+        dict(include_op_names=dict(op_name=["embeddings"]))
+    )
+
+    @flow.global_function(type="train", function_config=func_config)
+    def testIndexedSlicesAdamW(
+        sparse_ids: flow.typing.Numpy.Placeholder(ids_shape, dtype=flow.int32)
+    ) -> flow.typing.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            embedding_table = flow.get_variable(
+                name="embeddings",
+                shape=model_shape,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+            )
+            embedding = flow.gather(
+                params=embedding_table * mul_scalar, indices=sparse_ids
+            )
+            loss = flow.math.reduce_mean(embedding)
+            flow.optimizer.AdamW(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                do_bias_correction=True,
+                weight_decay=weight_decay,
+            ).minimize(loss)
+            return embedding_table
+
+    sparse_ids = np.random.randint(model_shape[0], size=ids_shape).astype(np.int32)
+    init_value = None
+    for i in range(train_iters + 1):
+        x = testIndexedSlicesAdamW(sparse_ids)
+        if i == 0:
+            init_value = np.copy(x)
+
+    def indexed_slices_update_numpy(
+        param,
+        unique_dict,
+        iter,
+        m,
+        v,
+        lr=0.001,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-07,
+        weight_decay=0.9,
+    ):
+        param_t = np.copy(param)
+        m_t = np.copy(m)
+        v_t = np.copy(v)
+        for ids in unique_dict.keys():
+            lr_t = lr * np.sqrt(1 - beta2 ** (iter + 1)) / (1 - beta1 ** (iter + 1))
+            m_t_o = beta1 * m[ids] + (1 - beta1) * unique_dict[ids]
+            v_t_o = beta2 * v[ids] + (1 - beta2) * unique_dict[ids] * unique_dict[ids]
+            m_t[ids] = m_t_o
+            v_t[ids] = v_t_o
+            param_t_o = param[ids] - lr_t * (
+                m_t[ids] / (np.sqrt(v_t[ids]) + epsilon) + weight_decay * param[ids]
+            )
+            param_t[ids] = param_t_o
+        return (param_t, m_t, v_t)
+
+    param = init_value
+    gradient = np.full(grad_shape, float(mul_scalar) / np.prod(grad_shape))
+    m = np.zeros(param.shape)
+    v = np.zeros(param.shape)
+    unique_dict = unique_grads(sparse_ids, gradient)
+    for i in range(train_iters):
+        (param, m, v) = indexed_slices_update_numpy(
+            param,
+            unique_dict,
+            i,
+            m,
+            v,
+            learning_rate,
+            beta1,
+            beta2,
+            epsilon,
+            weight_decay,
+        )
+    assert np.allclose(x.flatten(), param.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_flow_job_fused_sgd_model_update(
+    device_type, x_shape, momentum, learning_rate, train_iters
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+
+    def flow_net(var_name, random_mask):
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name=var_name,
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.ones_initializer(),
+                trainable=True,
+            )
+            constant_val = flow.constant(3.0, dtype=flow.float32, shape=(1,))
+            x = x * constant_val
+            x = x * 2.0
+            if device_type == "gpu":
+                x = flow.cast(x, flow.float16)
+                x = flow.math.relu(x)
+                x = flow.cast(x, flow.float)
+            loss = flow.math.reduce_mean(x * random_mask)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                momentum=momentum,
+            ).minimize(loss)
+            return x
+
+    def make_sgd_job():
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float32)
+
+        @flow.global_function(type="train", function_config=func_config)
+        def testSGD(
+            random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+        ) -> flow.typing.Numpy:
+            return flow_net("x1", random_mask)
+
+        return testSGD
+
+    def make_fused_sgd_job():
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float32)
+        func_config.enable_fuse_model_update_ops(True)
+
+        @flow.global_function(type="train", function_config=func_config)
+        def testFusedSGD(
+            random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+        ) -> flow.typing.Numpy:
+            return flow_net("x2", random_mask)
+
+        return testFusedSGD
+
+    sgd_job = make_sgd_job()
+    fused_sgd_job = make_fused_sgd_job()
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    for i in range(train_iters + 1):
+        var1 = sgd_job(random_masks_seq[i])
+    for i in range(train_iters + 1):
+        var2 = fused_sgd_job(random_masks_seq[i])
+    assert np.allclose(var1.flatten(), var2.flatten(), rtol=0.0001, atol=0.0001)
+
+
+def compare_with_flow_job_fused_adam_model_update(
+    device_type, x_shape, beta1, beta2, epsilon, learning_rate, train_iters
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+
+    def flow_net(var_name, random_mask):
+        with flow.scope.placement(device_type, "0:0-0"):
+            x = flow.get_variable(
+                name=var_name,
+                shape=x_shape,
+                dtype=flow.float32,
+                initializer=flow.ones_initializer(),
+                trainable=True,
+            )
+            constant_val = flow.constant(3.0, dtype=flow.float32, shape=(1,))
+            x = x * constant_val
+            x = x * 2.0
+            if device_type == "gpu":
+                x = flow.cast(x, flow.float16)
+                x = flow.math.relu(x)
+                x = flow.cast(x, flow.float)
+            loss = flow.math.reduce_mean(x * random_mask)
+            flow.optimizer.Adam(
+                flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                do_bias_correction=True,
+            ).minimize(loss)
+            return x
+
+    def make_adam_job():
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float32)
+
+        @flow.global_function(type="train", function_config=func_config)
+        def testAdam(
+            random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+        ) -> flow.typing.Numpy:
+            return flow_net("x1", random_mask)
+
+        return testAdam
+
+    def make_fused_adam_job():
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float32)
+        func_config.enable_fuse_model_update_ops(True)
+
+        @flow.global_function(type="train", function_config=func_config)
+        def testFusedAdam(
+            random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
+        ) -> flow.typing.Numpy:
+            return flow_net("x2", random_mask)
+
+        return testFusedAdam
+
+    adam_job = make_adam_job()
+    fused_adam_job = make_fused_adam_job()
+    random_masks_seq = []
+    for i in range(train_iters + 1):
+        random_masks_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    for i in range(train_iters + 1):
+        var1 = adam_job(random_masks_seq[i])
+    for i in range(train_iters + 1):
+        var2 = fused_adam_job(random_masks_seq[i])
+    assert np.allclose(var1.flatten(), var2.flatten(), rtol=0.0001, atol=0.0001)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestOptimizers(flow.unittest.TestCase):
+    def test_rmsprop(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["centered"] = [True, False]
+        arg_dict["decay_rate"] = [0.9]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow_rmsprop(*arg)
+
+    def test_adam(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["beta1"] = [0.9]
+        arg_dict["beta2"] = [0.99]
+        arg_dict["epsilon"] = [1e-09]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow_adam(*arg)
+
+    def test_lazy_adam(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["beta1"] = [0.9]
+        arg_dict["beta2"] = [0.99]
+        arg_dict["epsilon"] = [1e-09]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_lazy_adam(*arg)
+
+    def test_adamw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["beta1"] = [0.9]
+        arg_dict["beta2"] = [0.99]
+        arg_dict["epsilon"] = [1e-09]
+        arg_dict["weight_decay"] = [0.9]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_adamw(*arg)
+
+    def test_lars(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["momentum_beta"] = [0.9]
+        arg_dict["epsilon"] = [1e-09]
+        arg_dict["lars_coefficient"] = [0.0001]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["weight_decay"] = [0.9]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_lars(*arg)
+
+    def test_sgd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["momentum"] = [0.9, 0.0]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow_sgd(*arg)
+
+    def test_indexed_slices_sgd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["model_shape"] = [(200, 2)]
+        arg_dict["ids"] = [(10, 4)]
+        arg_dict["grad_shape"] = [(10, 4, 2)]
+        arg_dict["momentum_beta"] = [0, 0.9]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        arg_dict["mul_scalar"] = [1, 2]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_indexed_slices_sgd(*arg)
+
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "indexed slices sgdw doesn't work in eager mode",
+    )
+    def test_indexed_slices_sgdw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["model_shape"] = [(200, 2)]
+        arg_dict["ids"] = [(10, 4)]
+        arg_dict["grad_shape"] = [(10, 4, 2)]
+        arg_dict["momentum_beta"] = [0, 0.9]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        arg_dict["mul_scalar"] = [2]
+        arg_dict["weight_decay"] = [0.5, 0.3]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_indexed_slices_sgdw(*arg)
+
+    def test_indexed_slices_adam(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["model_shape"] = [(200, 2)]
+        arg_dict["ids"] = [(10, 4)]
+        arg_dict["grad_shape"] = [(10, 4, 2)]
+        arg_dict["beta1"] = [0.9]
+        arg_dict["beta2"] = [0.99]
+        arg_dict["epsilon"] = [1e-09]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        arg_dict["mul_scalar"] = [1, 2]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_indexed_slices_adam(*arg)
+
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(),
+        "indexed slices adamw doesn't work in eager mode",
+    )
+    def test_indexed_slices_adamw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["model_shape"] = [(200, 2)]
+        arg_dict["ids"] = [(10, 4)]
+        arg_dict["grad_shape"] = [(10, 4, 2)]
+        arg_dict["beta1"] = [0.9]
+        arg_dict["beta2"] = [0.99]
+        arg_dict["epsilon"] = [1e-09]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        arg_dict["mul_scalar"] = [2]
+        arg_dict["weight_decay"] = [0.5, 0.3]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_indexed_slices_adamw(*arg)
+
+    def test_fused_sgd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["momentum"] = [0.9, 0.0]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_flow_job_fused_sgd_model_update(*arg)
+
+    def test_fused_adam(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["beta1"] = [0.9]
+        arg_dict["beta2"] = [0.99]
+        arg_dict["epsilon"] = [1e-09]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgList(arg_dict):
+            compare_with_flow_job_fused_adam_model_update(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_pad.py b/python/oneflow/compatible/single_client/test/ops/test_pad.py
new file mode 100644
index 0000000000000000000000000000000000000000..d22a1b46fd729d30f73b169c16d5d1cd13c55490
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_pad.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import Args, CompareOpWithTensorFlow, GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestPad(flow.unittest.TestCase):
+    def test_pad(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["flow_op"] = [flow.pad]
+        arg_dict["tf_op"] = [tf.pad]
+        arg_dict["input_shape"] = [(2, 2, 1, 3), (1, 1, 2, 3)]
+        arg_dict["op_args"] = [
+            Args(
+                [([1, 2], [0, 0], [1, 2], [1, 1])],
+                tf.constant([([1, 2], [0, 0], [1, 2], [1, 1])]),
+            ),
+            Args(
+                [([0, 0], [30, 0], [0, 1], [1, 0]), 99999999999999999999999999999999],
+                [
+                    tf.constant(([0, 0], [30, 0], [0, 1], [1, 0])),
+                    "constant",
+                    99999999999999999999999999999999,
+                ],
+            ),
+            Args(
+                [([10, 0], [0, 0], [10, 20], [0, 0])],
+                tf.constant([([10, 0], [0, 0], [10, 20], [0, 0])]),
+            ),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareOpWithTensorFlow(**arg)
+
+    def test_pad_5d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["flow_op"] = [flow.pad]
+        arg_dict["tf_op"] = [tf.pad]
+        arg_dict["input_shape"] = [(2, 2, 1, 3, 1), (1, 1, 2, 3, 1)]
+        arg_dict["op_args"] = [
+            Args(
+                [([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])],
+                tf.constant([([1, 2], [3, 4], [5, 6], [7, 8], [9, 10])]),
+            ),
+            Args(
+                [([1, 1], [2, 2], [3, 3], [4, 4], [5, 5])],
+                tf.constant([([1, 1], [2, 2], [3, 3], [4, 4], [5, 5])]),
+            ),
+            Args(
+                [([0, 0], [0, 0], [10, 20], [0, 0], [3, 2])],
+                tf.constant([([0, 0], [0, 0], [10, 20], [0, 0], [3, 2])]),
+            ),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareOpWithTensorFlow(**arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_parallel.py b/python/oneflow/compatible/single_client/test/ops/test_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..a47255424ad48119cb11fd88090bb17a06e27af5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_parallel.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def NaiveTest(test_case):
+    shape = (16, 2)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def AddJob(a: oft.Numpy.Placeholder(shape), b: oft.Numpy.Placeholder(shape)):
+        return a + b + b
+
+    x = np.random.rand(*shape).astype(np.float32)
+    y = np.random.rand(*shape).astype(np.float32)
+    z = AddJob(x, y).get().numpy()
+    test_case.assertTrue(np.array_equal(z, x + y + y))
+
+
+class TestParallel(flow.unittest.TestCase):
+    @flow.unittest.skip_unless_1n1d()
+    def test_1n1c(test_case):
+        flow.config.gpu_device_num(1)
+        NaiveTest(test_case)
+
+    @flow.unittest.skip_unless_1n2d()
+    def test_1n2c(test_case):
+        flow.config.gpu_device_num(2)
+        NaiveTest(test_case)
+
+    @flow.unittest.skip_unless_2n1d()
+    def test_2n2c(test_case):
+        flow.config.gpu_device_num(1)
+        NaiveTest(test_case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_parallel_cast.py b/python/oneflow/compatible/single_client/test/ops/test_parallel_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..a744e76d5a0abcded843248e4503d47525b50196
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_parallel_cast.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def _test(test_case, device_num):
+    (m, k, n) = (5, 6, 7)
+    a_shape = (m, k)
+    b_shape = (k, n)
+    c_shape = (n,)
+    flow.config.gpu_device_num(device_num)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.prune_parallel_cast_ops(True)
+
+    @flow.global_function("train", function_config=func_config)
+    def test_fn(
+        a: flow.typing.Numpy.Placeholder(a_shape),
+        b: flow.typing.Numpy.Placeholder(b_shape),
+        c: flow.typing.Numpy.Placeholder(c_shape),
+    ) -> flow.typing.Numpy:
+        var_a = flow.get_variable(
+            name="var_a",
+            shape=a_shape,
+            dtype=flow.float32,
+            initializer=flow.ones_initializer(),
+            distribute=flow.distribute.split(1),
+        )
+        a = flow.parallel_cast(a, distribute=flow.distribute.split(1))
+        a = var_a * a
+        out = flow.matmul(a, b)
+        out = flow.parallel_cast(
+            out,
+            distribute=flow.distribute.broadcast(),
+            gradient_distribute=flow.distribute.broadcast(),
+        )
+        c = flow.parallel_cast(c, distribute=flow.distribute.broadcast())
+        out = flow.nn.bias_add(out, c)
+        lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+        flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(out)
+        return out
+
+    a = np.random.rand(*a_shape).astype(np.float32)
+    b = np.random.rand(*b_shape).astype(np.float32)
+    c = np.random.rand(*c_shape).astype(np.float32)
+    out = test_fn(a, b, c)
+    test_case.assertTrue(np.allclose(out, np.matmul(a, b) + c))
+
+
+@flow.unittest.skip_unless_1n2d()
+@unittest.skipIf(
+    flow.unittest.env.eager_execution_enabled(),
+    "Parallel cast SBP doesn't work in eager mode",
+)
+class TestParallelCast(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_on_gpu(test_case):
+        _test(test_case, 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_partial_fc.py b/python/oneflow/compatible/single_client/test/ops/test_partial_fc.py
new file mode 100644
index 0000000000000000000000000000000000000000..09f5c651c518b49bd2c102ce61cecd065c89425a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_partial_fc.py
@@ -0,0 +1,154 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def compare_with_np(device_type, label_type, num_classes, num_sample, batch_size):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.gpu_device_num(0)
+        flow.config.cpu_device_num(4)
+    else:
+        flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.indexed_slices_optimizer_conf(dict(include_op_names=dict(op_name=[])))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def PartialFcJob(
+        labels: oft.Numpy.Placeholder(
+            (batch_size,), dtype=type_name_to_flow_type[label_type]
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x-weight",
+                shape=(num_classes, 128),
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+        with flow.scope.placement(device_type, "0:0-3"):
+            lebels_distribute = flow.distribute.broadcast()
+            weight_distribute = flow.distribute.split(0)
+            (
+                maped_label,
+                sampled_label,
+                sampled_weight,
+            ) = flow.distributed_partial_fc_sample(
+                weight=x.with_distribute(weight_distribute),
+                label=labels.with_distribute(lebels_distribute),
+                num_sample=num_sample,
+            )
+        with flow.scope.placement(device_type, "0:0"):
+            sampled_weight = flow.identity(sampled_weight)
+            loss = flow.math.square(sampled_weight)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch_diff(
+                sampled_weight, test_global_storage.Setter("sampled_weight_diff")
+            )
+        return (x, maped_label, sampled_label, sampled_weight)
+
+    labels = np.random.randint(0, num_classes, size=(batch_size,)).astype(
+        type_name_to_np_type[label_type]
+    )
+    (weight, maped_label, sampled_label, sampled_weight) = PartialFcJob(labels).get()
+    gpu_num = 4
+    device_class_num = num_classes // gpu_num
+    device_num_sample = num_sample // gpu_num
+    global_sample_labels_list = []
+    np_mapped_label = []
+    label_map = {}
+    for i in range(gpu_num):
+        lower = i * device_class_num
+        upper = (i + 1) * device_class_num
+        condition = (labels >= lower) & (labels < upper)
+        local_label = labels[condition]
+        local_label = np.unique(local_label).astype(np.int32)
+        idx_start = int(i * device_num_sample)
+        idx_end = int((i + 1) * device_num_sample)
+        local_sample_labels = sampled_label[idx_start:idx_end]
+        global_sample_labels = local_sample_labels
+        global_sample_labels_list.append(global_sample_labels)
+        assert (
+            np.all((local_sample_labels >= lower) & (local_sample_labels < upper))
+            == True
+        )
+        assert len(local_sample_labels) == len(np.unique(local_sample_labels))
+        assert (
+            np.array_equal(local_label, global_sample_labels[0 : len(local_label)])
+            == True
+        )
+        for j in range(len(global_sample_labels)):
+            label_map[global_sample_labels[j]] = j + idx_start
+    for i in range(len(labels)):
+        np_mapped_label.append(label_map[labels[i]])
+    assert np.array_equal(np.array(np_mapped_label), maped_label.numpy()) == True
+    global_sample_label = np.array(global_sample_labels_list).flatten().astype(np.int32)
+    np_sample_weight = weight[global_sample_label]
+    assert np.array_equal(sampled_weight.numpy(), np_sample_weight) == True
+    sampled_weight_diff = test_global_storage.Get("sampled_weight_diff")
+    np_weight_diff = np.zeros(weight.shape)
+    for i in range(len(global_sample_label)):
+        np_weight_diff[global_sample_label[i]] = sampled_weight_diff[i]
+    x_diff = test_global_storage.Get("x_diff")
+    assert np.array_equal(test_global_storage.Get("x_diff"), np_weight_diff) == True
+
+
+flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestPartialFc(flow.unittest.TestCase):
+    def test_partial_fc1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["label_type"] = ["int32"]
+        arg_dict["num_classes"] = [85744]
+        arg_dict["num_sample"] = [8600]
+        arg_dict["batch_size"] = [512]
+        for arg in GenArgList(arg_dict):
+            compare_with_np(*arg)
+
+    def test_partial_fc2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["label_type"] = ["int32"]
+        arg_dict["num_classes"] = [200]
+        arg_dict["num_sample"] = [64]
+        arg_dict["batch_size"] = [32]
+        for arg in GenArgList(arg_dict):
+            compare_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_pixel_shuffle.py b/python/oneflow/compatible/single_client/test/ops/test_pixel_shuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9031530dae043fbbd795c592d3f0206dc1ff426
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_pixel_shuffle.py
@@ -0,0 +1,166 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_pixel_shuffle_with_np(
+    input_shape, upscale_factor, device_type, machine_ids, device_counts
+):
+    input_1 = np.random.random(size=input_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_pixel_shuffle(input):
+        (_batch, _channel, _height, _width) = input.shape
+        assert (
+            _channel % upscale_factor ** 2 == 0
+        ), "The channels of input tensor must be divisible by (upscale_factor * upscale_factor)"
+        _new_c = int(_channel / upscale_factor ** 2)
+        out = np.reshape(
+            input, [_batch, _new_c, upscale_factor * upscale_factor, _height, _width]
+        )
+        out = np.reshape(
+            out, [_batch * _new_c, upscale_factor, upscale_factor, _height, _width]
+        )
+        out = np.transpose(out, [0, 3, 1, 4, 2])
+        out = np.reshape(
+            out, [_batch, _new_c, _height * upscale_factor, _width * upscale_factor]
+        )
+        return out
+
+    np_out_pixel_shuffle = np_pixel_shuffle(input_1)
+    np_random_mul = np.random.random(size=np_out_pixel_shuffle.shape).astype(np.float32)
+
+    def np_pixel_shuffle_diff(input, np_diff, upscale_factor):
+        (_batch, _new_channel, _height_mul_factor, _width_mul_factor) = input.shape
+        _channel = _new_channel * upscale_factor ** 2
+        _height = _height_mul_factor // upscale_factor
+        _width = _width_mul_factor // upscale_factor
+        bp_result = np.zeros((_batch, _channel, _height, _width)).astype(np.float32)
+        for c in range(_channel):
+            for h in range(_height):
+                for w in range(_width):
+                    out_c_idx = int(c / upscale_factor ** 2)
+                    inner_c = c - out_c_idx * upscale_factor * upscale_factor
+                    out_h_idx = h * upscale_factor + int(inner_c / upscale_factor)
+                    out_w_idx = w * upscale_factor + int(inner_c % upscale_factor)
+                    bp_result[:, c, h, w] = np_diff[:, out_c_idx, out_h_idx, out_w_idx]
+        return bp_result
+
+    _np_grad = np_pixel_shuffle_diff(
+        np_out_pixel_shuffle, np_random_mul, upscale_factor
+    )
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_pixel_shuffle(
+        of_input_1: tp.Numpy.Placeholder(shape=input_1.shape),
+        of_mul: tp.Numpy.Placeholder(shape=np_random_mul.shape),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input_1.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input_1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        of_pixel_shuffle_out = flow.nn.PixelShuffle(
+            x_var, upscale_factor, name="PixelShuffle"
+        )
+        out = of_pixel_shuffle_out * of_mul
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(out)
+        return of_pixel_shuffle_out
+
+    of_out_pixel_shuffle = oneflow_pixel_shuffle(input_1, np_random_mul)
+    assert np.allclose(of_out_pixel_shuffle, np_out_pixel_shuffle)
+
+
+def _gen_arg_dict(shape, upscale_factor, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["upscale_factor"] = [upscale_factor]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPixelShuffle1n1d(flow.unittest.TestCase):
+    def test_pixel_shuffle_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 4, 2, 2),
+            upscale_factor=2,
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_pixel_shuffle_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_pixel_shuffle_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 2, 2),
+            upscale_factor=2,
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_pixel_shuffle_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestPixelShuffle1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_pixel_shuffle_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 16, 2, 2),
+            upscale_factor=2,
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_pixel_shuffle_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_pixel_shufflev2.py b/python/oneflow/compatible/single_client/test/ops/test_pixel_shufflev2.py
new file mode 100644
index 0000000000000000000000000000000000000000..205bfa487b5b931d12033b56f70907f29fa1b722
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_pixel_shufflev2.py
@@ -0,0 +1,151 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_pixel_shuffle_with_np(
+    input_shape, h_factor, w_factor, device_type, machine_ids, device_counts
+):
+    input_1 = np.random.random(size=input_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_pixel_shuffle(input):
+        (_batch, _channel, _height, _width) = input.shape
+        assert (
+            _channel % (h_factor * w_factor) == 0
+        ), "The channels of input tensor must be divisible by (h_upscale_factor * w_upscale_factor)"
+        _new_c = int(_channel / (h_factor * w_factor))
+        out = np.reshape(input, [_batch, _new_c, h_factor * w_factor, _height, _width])
+        out = np.reshape(out, [_batch, _new_c, h_factor, w_factor, _height, _width])
+        out = np.transpose(out, [0, 1, 4, 2, 5, 3])
+        out = np.reshape(out, [_batch, _new_c, _height * h_factor, _width * w_factor])
+        return out
+
+    np_out_pixel_shuffle = np_pixel_shuffle(input_1)
+
+    def np_pixel_shuffle_diff(input, h_factor, w_factor):
+        (_batch, _new_channel, _height_mul_factor, _width_mul_factor) = input.shape
+        _channel = _new_channel * (h_factor * w_factor)
+        _height = _height_mul_factor // h_factor
+        _width = _width_mul_factor // w_factor
+        out = np.ones(shape=(_batch, _channel, _height, _width))
+        return out
+
+    _np_grad = np_pixel_shuffle_diff(np_out_pixel_shuffle, h_factor, w_factor)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_pixel_shuffle(
+        of_input_1: tp.Numpy.Placeholder(shape=input_1.shape),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input_1.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input_1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        of_pixel_shuffle_out = flow.nn.PixelShufflev2(
+            x_var, h_factor, w_factor, name="PixelShufflev2"
+        )
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_pixel_shuffle_out)
+        return of_pixel_shuffle_out
+
+    of_out_pixel_shuffle = oneflow_pixel_shuffle(input_1)
+    assert np.allclose(of_out_pixel_shuffle, np_out_pixel_shuffle)
+
+
+def _gen_arg_dict(shape, h_factor, w_factor, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["h_factor"] = [h_factor]
+    arg_dict["w_factor"] = [w_factor]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPixelShuffle1n1d(flow.unittest.TestCase):
+    def test_pixel_shuffle_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 2, 4),
+            h_factor=2,
+            w_factor=4,
+            device_type="cpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_pixel_shuffle_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_pixel_shuffle_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(2, 16, 2, 2),
+            h_factor=2,
+            w_factor=2,
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_pixel_shuffle_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestPixelShuffle1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_pixel_shuffle_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 16, 2, 4),
+            h_factor=2,
+            w_factor=2,
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_pixel_shuffle_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_polyval.py b/python/oneflow/compatible/single_client/test/ops/test_polyval.py
new file mode 100644
index 0000000000000000000000000000000000000000..0203531fca6ac74e7a02fff6c1a394715846fa3a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_polyval.py
@@ -0,0 +1,107 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def compare_with_numpy(device_type, device_num, in_shape, data_type, coeffs):
+    assert device_type in ["cpu", "gpu"]
+    assert data_type in ["float32", "double"]
+    flow_data_type = type_name_to_flow_type[data_type]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        flow.config.gpu_device_num(device_num)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow_data_type)
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    func_config.default_logical_view(flow.scope.consistent_view())
+    x = (np.random.random(in_shape) * 100).astype(type_name_to_np_type[data_type])
+
+    def np_polyval_grad(coeffs, x):
+        coeffs_len = len(coeffs)
+        coeffs_diff = [(coeffs_len - i - 1) * coeffs[i] for i in range(coeffs_len - 1)]
+        np_x_diff = np.polyval(coeffs_diff, x)
+        return np_x_diff
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        np_x_diff = np_polyval_grad(coeffs, x)
+        assert np.allclose(blob, np_x_diff, rtol=1e-05, atol=1e-05)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def PolyValJob(x: tp.Numpy.Placeholder(shape=in_shape)):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="x",
+                shape=in_shape,
+                dtype=flow_data_type,
+                initializer=flow.zeros_initializer(),
+                trainable=True,
+            )
+        flow.watch_diff(x, assert_prediction_grad)
+        out = flow.math.polyval(coeffs, x)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(out)
+        return out
+
+    of_out = PolyValJob(x).get().numpy()
+    np_out = np.polyval(coeffs, x)
+    assert np.allclose(of_out, np_out, rtol=1e-05, atol=1e-05)
+
+
+def gen_arg_list(type):
+    arg_dict = OrderedDict()
+    if type == "1n2d":
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["device_num"] = [2]
+    else:
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["device_num"] = [1]
+    arg_dict["in_shape"] = [(2, 3)]
+    arg_dict["data_type"] = ["float32"]
+    arg_dict["coeffs"] = [[1.0, 2.0], [1.0, 2.0, 3.0]]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPolyval1n1d(flow.unittest.TestCase):
+    def test_polyval(test_case):
+        for arg in gen_arg_list("1n1d"):
+            compare_with_numpy(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestPolyval1n2d(flow.unittest.TestCase):
+    def test_polyval(test_case):
+        for arg in gen_arg_list("1n2d"):
+            compare_with_numpy(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_pool.py b/python/oneflow/compatible/single_client/test/ops/test_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..698452126dfd862c9d1fdab0cbdb384bf6ba1aa1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_pool.py
@@ -0,0 +1,246 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import collections
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+pool_confs = [
+    {
+        "x_shape": (1, 1, 6, 6),
+        "ksize": 1,
+        "strides": 1,
+        "padding": "VALID",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 3, 7, 7),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 7, 7, 3),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NHWC",
+    },
+    {
+        "x_shape": (1, 5, 6, 6),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "VALID",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 7, 5, 5),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 3, 3, 3),
+        "ksize": 1,
+        "strides": 1,
+        "padding": "VALID",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 1, 9, 9),
+        "ksize": 2,
+        "strides": 2,
+        "padding": "VALID",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 9, 9, 1),
+        "ksize": 2,
+        "strides": 2,
+        "padding": "VALID",
+        "data_format": "NHWC",
+    },
+    {
+        "x_shape": (1, 1, 9, 9, 9),
+        "ksize": 2,
+        "strides": 2,
+        "padding": "VALID",
+        "data_format": "NCDHW",
+    },
+    {
+        "x_shape": (1, 7, 5, 5, 5),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NCDHW",
+    },
+    {
+        "x_shape": (1, 5, 5, 5, 7),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "VALID",
+        "data_format": "NDHWC",
+    },
+    {
+        "x_shape": (1, 3, 3, 3, 3),
+        "ksize": 1,
+        "strides": 1,
+        "padding": "VALID",
+        "data_format": "NCDHW",
+    },
+]
+
+
+def _GetSequence(value, n, name):
+    """Formats value from input"""
+    if value is None:
+        value = [1]
+    elif not isinstance(value, collections.Sized):
+        value = [value]
+    current_n = len(value)
+    if current_n == 1:
+        return list(value * n)
+    elif current_n == n:
+        return list(value)
+    else:
+        raise ValueError(
+            "{} should be of length 1 or {} but was {}".format(name, n, current_n)
+        )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPool(flow.unittest.TestCase):
+    def test_pool(_):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["pool_conf"] = pool_confs
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["pooling_type"] = ["AVG", "MAX"]
+        arg_dict["is_dynamic"] = [True, False]
+        for case in GenArgList(arg_dict):
+            (device_type, pool_conf, data_type, pooling_type, is_dynamic) = case
+            x_shape = pool_conf["x_shape"]
+            ksize = pool_conf["ksize"]
+            strides = pool_conf["strides"]
+            padding = pool_conf["padding"]
+            data_format = pool_conf["data_format"]
+            if os.getenv("ONEFLOW_TEST_CPU_ONLY") and data_format != "NHWC":
+                continue
+            flow.clear_default_session()
+            x = np.random.randn(*x_shape).astype(type_name_to_np_type[data_type])
+            dim = len(x.shape) - 2
+            if dim == 3 and data_format == "NDHWC":
+                continue
+            with tf.GradientTape(persistent=True) as tape:
+                x_tf = tf.Variable(x)
+                strides = _GetSequence(strides, dim, "strides")
+                pooling_f = None
+                if pooling_type == "AVG":
+                    pooling_f = getattr(tf.nn, "avg_pool{}d".format(dim))
+                elif pooling_type == "MAX":
+                    pooling_f = getattr(tf.nn, "max_pool{}d".format(dim))
+                else:
+                    raise ValueError("pooling_type must be AVG or MAX")
+                y_tf = pooling_f(x_tf, ksize, strides, padding, data_format=data_format)
+            dx_tf = tape.gradient(y_tf, x_tf, tf.constant(1.0, shape=y_tf.shape))
+
+            def assert_grad(b):
+                if b.is_dynamic:
+                    b_ndarray = b.numpy_list()[0]
+                else:
+                    b_ndarray = b.numpy()
+                assert np.allclose(dx_tf.numpy(), b_ndarray), (
+                    case,
+                    dx_tf.numpy(),
+                    b_ndarray,
+                )
+
+            dtype = type_name_to_flow_type[data_type]
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            tensor_def = None
+            if is_dynamic:
+                func_config.default_logical_view(flow.scope.mirrored_view())
+                tensor_def = oft.ListNumpy.Placeholder
+            else:
+                tensor_def = oft.Numpy.Placeholder
+
+            @flow.global_function(type="train", function_config=func_config)
+            def pooling_job(x: tensor_def(x_shape, dtype=dtype)):
+                v = flow.get_variable(
+                    "x",
+                    shape=x_shape,
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(0),
+                    trainable=True,
+                )
+                v = flow.cast_to_current_logical_view(v)
+                flow.watch_diff(v, assert_grad)
+                x += v
+                with flow.scope.placement(device_type, "0:0"):
+                    pooling_f = None
+                    if pooling_type == "AVG":
+                        pooling_f = getattr(flow.nn, "avg_pool{}d".format(dim))
+                    elif pooling_type == "MAX":
+                        pooling_f = getattr(flow.nn, "max_pool{}d".format(dim))
+                    else:
+                        raise ValueError("pooling_type must be AVG or MAX")
+                    y = pooling_f(
+                        x,
+                        ksize=ksize,
+                        strides=strides,
+                        padding=padding,
+                        data_format=data_format,
+                    )
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+                ).minimize(y)
+                return y
+
+            if is_dynamic:
+                x = [x]
+            y = pooling_job(x).get()
+            y_ndarray = None
+            if is_dynamic:
+                y_ndarray = y.numpy_list()[0]
+            else:
+                y_ndarray = y.numpy()
+            assert y_ndarray.shape == y_tf.numpy().shape, (
+                y_ndarray.shape,
+                y_tf.numpy().shape,
+            )
+            assert np.allclose(y_ndarray, y_tf.numpy(), rtol=1e-05, atol=1e-05), (
+                case,
+                y_ndarray - y_tf.numpy(),
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_pool_padding.py b/python/oneflow/compatible/single_client/test/ops/test_pool_padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b24b8e6cce09a14f0388333befee832aedcb104
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_pool_padding.py
@@ -0,0 +1,249 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import collections
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+pool_confs = [
+    {
+        "x_shape": (1, 1, 10, 10),
+        "ksize": 2,
+        "strides": 1,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 3, 7, 7),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 7, 7, 3),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NHWC",
+    },
+    {
+        "x_shape": (1, 5, 6, 6),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 7, 5, 5),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 3, 12, 12),
+        "ksize": 2,
+        "strides": 1,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 1, 11, 11),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NCHW",
+    },
+    {
+        "x_shape": (1, 10, 10, 1),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "SAME",
+        "data_format": "NHWC",
+    },
+    {
+        "x_shape": (1, 1, 10, 10, 10),
+        "ksize": 2,
+        "strides": 2,
+        "padding": "VALID",
+        "data_format": "NCDHW",
+    },
+    {
+        "x_shape": (1, 7, 5, 5, 5),
+        "ksize": 3,
+        "strides": 1,
+        "padding": "SAME",
+        "data_format": "NCDHW",
+    },
+    {
+        "x_shape": (1, 5, 5, 5, 7),
+        "ksize": 3,
+        "strides": 2,
+        "padding": "VALID",
+        "data_format": "NDHWC",
+    },
+    {
+        "x_shape": (1, 3, 3, 3, 3),
+        "ksize": 2,
+        "strides": 1,
+        "padding": "SAME",
+        "data_format": "NCDHW",
+    },
+]
+
+
+def _GetSequence(value, n, name):
+    """Formats value from input"""
+    if value is None:
+        value = [1]
+    elif not isinstance(value, collections.Sized):
+        value = [value]
+    current_n = len(value)
+    if current_n == 1:
+        return list(value * n)
+    elif current_n == n:
+        return list(value)
+    else:
+        raise ValueError(
+            "{} should be of length 1 or {} but was {}".format(name, n, current_n)
+        )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPoolPadding(flow.unittest.TestCase):
+    def test_pool(_):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["pool_conf"] = pool_confs
+        arg_dict["data_type"] = ["float32"]
+        arg_dict["pooling_type"] = ["AVG", "MAX"]
+        arg_dict["is_dynamic"] = [True, False]
+        for case in GenArgList(arg_dict):
+            (device_type, pool_conf, data_type, pooling_type, is_dynamic) = case
+            x_shape = pool_conf["x_shape"]
+            ksize = pool_conf["ksize"]
+            strides = pool_conf["strides"]
+            padding = pool_conf["padding"]
+            data_format = pool_conf["data_format"]
+            if os.getenv("ONEFLOW_TEST_CPU_ONLY") and data_format != "NHWC":
+                continue
+            flow.clear_default_session()
+            x = np.random.randn(*x_shape).astype(type_name_to_np_type[data_type])
+            dim = len(x.shape) - 2
+            if dim == 3 and data_format == "NDHWC":
+                continue
+            with tf.GradientTape(persistent=True) as tape:
+                x_tf = tf.Variable(x)
+                strides = _GetSequence(strides, dim, "strides")
+                pooling_f = None
+                if pooling_type == "AVG":
+                    pooling_f = getattr(tf.nn, "avg_pool{}d".format(dim))
+                elif pooling_type == "MAX":
+                    pooling_f = getattr(tf.nn, "max_pool{}d".format(dim))
+                else:
+                    raise ValueError("pooling_type must be AVG or MAX")
+                y_tf = pooling_f(x_tf, ksize, strides, padding, data_format=data_format)
+            dx_tf = tape.gradient(y_tf, x_tf, tf.constant(1.0, shape=y_tf.shape))
+
+            def assert_grad(b):
+                if b.is_dynamic:
+                    b_ndarray = b.numpy_list()[0]
+                else:
+                    b_ndarray = b.numpy()
+                assert np.allclose(dx_tf.numpy(), b_ndarray), (
+                    case,
+                    dx_tf.numpy(),
+                    b_ndarray,
+                )
+
+            dtype = type_name_to_flow_type[data_type]
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            tensor_def = None
+            if is_dynamic:
+                func_config.default_logical_view(flow.scope.mirrored_view())
+                tensor_def = oft.ListNumpy.Placeholder
+            else:
+                tensor_def = oft.Numpy.Placeholder
+
+            @flow.global_function(type="train", function_config=func_config)
+            def pooling_job(x: tensor_def(x_shape, dtype=dtype)):
+                v = flow.get_variable(
+                    "x",
+                    shape=x_shape,
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(0),
+                    trainable=True,
+                )
+                v = flow.cast_to_current_logical_view(v)
+                flow.watch_diff(v, assert_grad)
+                x += v
+                with flow.scope.placement(device_type, "0:0"):
+                    pooling_f = None
+                    if pooling_type == "AVG":
+                        pooling_f = getattr(flow.nn, "avg_pool{}d".format(dim))
+                    elif pooling_type == "MAX":
+                        pooling_f = getattr(flow.nn, "max_pool{}d".format(dim))
+                    else:
+                        raise ValueError("pooling_type must be AVG or MAX")
+                    padding = pool_conf["padding"]
+                    if padding == "SAME":
+                        padding = "SAME_UPPER"
+                    y = pooling_f(
+                        x,
+                        ksize=ksize,
+                        strides=strides,
+                        padding=padding,
+                        data_format=data_format,
+                    )
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+                ).minimize(y)
+                return y
+
+            if is_dynamic:
+                x = [x]
+            y = pooling_job(x).get()
+            y_ndarray = None
+            if is_dynamic:
+                y_ndarray = y.numpy_list()[0]
+            else:
+                y_ndarray = y.numpy()
+            assert y_ndarray.shape == y_tf.numpy().shape, (
+                y_ndarray.shape,
+                y_tf.numpy().shape,
+            )
+            assert np.allclose(y_ndarray, y_tf.numpy(), rtol=1e-05, atol=1e-05), (
+                case,
+                y_ndarray - y_tf.numpy(),
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_prelu.py b/python/oneflow/compatible/single_client/test/ops/test_prelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..2720696e79e825020077e03b5a3a254a36dd5f80
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_prelu.py
@@ -0,0 +1,116 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _check(test_case, x, y, shared_axes):
+    alpha_of = test_global_storage.Get("alpha")
+    alpha = np.expand_dims(alpha_of, axis=0)
+    dy = test_global_storage.Get("loss_diff")
+    np_prelu_out = np.where(x > 0, x, x * alpha)
+    np_prelu_x_diff = np.where(x > 0, dy, dy * alpha)
+    np_prelu_alpha_diff = np.where(x > 0, 0, dy * x)
+    np_prelu_alpha_diff = np.add.reduce(
+        np_prelu_alpha_diff, axis=shared_axes, keepdims=True
+    )
+    np_prelu_alpha_diff = np.add.reduce(np_prelu_alpha_diff, axis=0)
+    test_case.assertTrue(np.allclose(np_prelu_out, y))
+    test_case.assertTrue(
+        np.allclose(np_prelu_x_diff, test_global_storage.Get("x_diff"))
+    )
+    test_case.assertTrue(
+        np.allclose(np_prelu_alpha_diff, test_global_storage.Get("alpha_diff"))
+    )
+
+
+def _run_test(test_case, device_type, dtype, x_shape, shared_axes):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def PreluJob(
+        x: oft.Numpy.Placeholder(x_shape, dtype=type_name_to_flow_type[dtype])
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.zeros_initializer(),
+            )
+            loss = flow.layers.prelu(
+                x,
+                alpha_initializer=flow.random_uniform_initializer(
+                    minval=0.1, maxval=0.9
+                ),
+                shared_axes=shared_axes,
+                name="prelu",
+            )
+            alpha_shape = list(x.shape[1:])
+            if shared_axes is not None:
+                for i in shared_axes:
+                    alpha_shape[i - 1] = 1
+            alpha = flow.get_variable(
+                "prelu-alpha",
+                shape=tuple(alpha_shape),
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.random_uniform_initializer(minval=0.1, maxval=0.9),
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(alpha, test_global_storage.Setter("alpha"))
+            flow.watch_diff(alpha, test_global_storage.Setter("alpha_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    x = (np.random.random(x_shape) - 1).astype(type_name_to_np_type[dtype])
+    y = PreluJob(x).get()
+    _check(test_case, x, y.numpy(), shared_axes)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPrelu(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_prelu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dtype"] = ["float32"]
+        arg_dict["x_shape"] = [(10, 32, 20, 20)]
+        arg_dict["shared_axes"] = [(2,), (1, 2), (1, 3), (1, 2, 3)]
+        for arg in GenArgList(arg_dict):
+            _run_test(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_quantization_aware_training.py b/python/oneflow/compatible/single_client/test/ops/test_quantization_aware_training.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d4c4e6c356ddf0e4a149903eb546dd11409538a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_quantization_aware_training.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _test(test_case, per_channel, symmetric, target_backend, build_backbone_fn):
+    def run_with_func_config(build_backbone_fn, func_config):
+        flow.clear_default_session()
+        flow.config.enable_debug_mode(True)
+        INPUT_SHAPE = (2, 3, 4, 5)
+
+        @flow.global_function(type="train", function_config=func_config)
+        def Foo(x: tp.Numpy.Placeholder(INPUT_SHAPE)) -> tp.Numpy:
+            y = build_backbone_fn(x)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [5]), momentum=0
+            ).minimize(y)
+            return y
+
+        res = Foo(np.ones(INPUT_SHAPE, dtype=np.float32))
+        return res
+
+    qat_func_config = flow.FunctionConfig()
+    qat_func_config.enable_qat(True)
+    qat_func_config.qat.symmetric(symmetric)
+    qat_func_config.qat.per_channel_weight_quantization(per_channel)
+    qat_func_config.qat.moving_min_max_stop_update_after_iters(1000)
+    qat_func_config.qat.target_backend(target_backend)
+    res_qat = run_with_func_config(build_backbone_fn, qat_func_config)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_DRY_RUN"), "can't run in dry run")
+class TestQAT(flow.unittest.TestCase):
+    def test_qat(test_case):
+        def build_conv_with_bias(x):
+            y = flow.layers.conv2d(x, 4, 3, 1, "SAME", use_bias=True, name="conv1")
+            with flow.experimental.scope.config(quantization_aware_training=False):
+                z = flow.layers.conv2d(y, 4, 3, 1, "SAME", use_bias=True, name="conv2")
+                return z
+
+        def build_conv_without_bias(x):
+            y = flow.layers.conv2d(x, 4, 3, 1, "SAME", use_bias=False, name="conv1")
+            with flow.experimental.scope.config(quantization_aware_training=False):
+                z = flow.layers.conv2d(y, 4, 3, 1, "SAME", use_bias=False, name="conv2")
+                return z
+
+        arg_dict = OrderedDict()
+        arg_dict["per_channel"] = [True, False]
+        arg_dict["symmetric"] = [True, False]
+        arg_dict["target_backend"] = ["", "cambricon"]
+        arg_dict["build_backbone_fn"] = [build_conv_with_bias, build_conv_without_bias]
+        for arg in GenArgList(arg_dict):
+            if arg[2] == "cambricon" and arg[0] == True:
+                continue
+            _test(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_quantize_op.py b/python/oneflow/compatible/single_client/test/ops/test_quantize_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ab14d8eae691419023de2d9100330b41f545231
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_quantize_op.py
@@ -0,0 +1,511 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def gen_quant_scale_for_min_max_symmetric(weight, quantization_bit):
+    weight_max = np.max(np.abs(weight))
+    denominator = 2.0 ** (quantization_bit - 1) - 1
+    return (weight_max / denominator, 0)
+
+
+def gen_quant_scale_for_min_max_affine(weight, quantization_bit):
+    weight_max = np.max(weight)
+    weight_min = np.min(weight)
+    denominator = 2.0 ** quantization_bit - 1
+    scale = (weight_max - weight_min) / denominator
+    zero_point = -np.round(weight_min / scale)
+    return (scale, zero_point)
+
+
+def gen_quant_scale_for_min_max_cambricon(weight, quantization_bit):
+    weight_max = np.max(np.abs(weight))
+    scale = math.floor(math.log2(weight_max)) - (quantization_bit - 2)
+    return (scale, 0)
+
+
+def product(tu):
+    return np.prod(tu).astype(np.int).item()
+
+
+def _check_min_max_observer(
+    test_case,
+    weight,
+    scale_of,
+    zero_point_of,
+    quantization_bit,
+    quantization_scheme,
+    quantization_formula,
+    per_layer_quantization,
+):
+    if per_layer_quantization or quantization_formula == "cambricon":
+        outer_num = 1
+        inner_num = product(weight.shape[0:])
+    else:
+        outer_num = weight.shape[0]
+        inner_num = product(weight.shape[1:])
+    scale_np = np.zeros((outer_num,))
+    zero_point_np = np.zeros((outer_num,))
+    weight_flatten = weight.flatten()
+    if quantization_formula == "google":
+        if quantization_scheme == "symmetric":
+            for c in range(outer_num):
+                (scale_np[c], zero_point_np[c]) = gen_quant_scale_for_min_max_symmetric(
+                    weight_flatten[c * inner_num : (c + 1) * inner_num],
+                    quantization_bit,
+                )
+        else:
+            for c in range(outer_num):
+                (scale_np[c], zero_point_np[c]) = gen_quant_scale_for_min_max_affine(
+                    weight_flatten[c * inner_num : (c + 1) * inner_num],
+                    quantization_bit,
+                )
+    else:
+        (scale_np[0], zero_point_np[0]) = gen_quant_scale_for_min_max_cambricon(
+            weight_flatten, quantization_bit
+        )
+    test_case.assertTrue(np.allclose(scale_of, scale_np, rtol=0.001))
+    test_case.assertTrue(
+        np.allclose(
+            zero_point_of.astype(np.int), zero_point_np.astype(np.int), rtol=0.001
+        )
+    )
+
+
+def _run_test_min_max_observer(
+    test_case,
+    device_type,
+    device_num,
+    dtype,
+    weight_shape,
+    quantization_bit,
+    quantization_scheme,
+    quantization_formula,
+    per_layer_quantization,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        flow.config.gpu_device_num(device_num)
+
+    @flow.global_function(type="predict", function_config=flow.FunctionConfig())
+    def QuantizeJob(
+        weight: oft.Numpy.Placeholder(weight_shape, dtype=type_name_to_flow_type[dtype])
+    ):
+        with flow.scope.placement(device_type, "0:0-%d" % (device_num - 1)):
+            (scale, zero_point) = flow.quantization.min_max_observer(
+                weight,
+                quantization_bit,
+                quantization_scheme,
+                quantization_formula,
+                per_layer_quantization,
+            )
+        return (scale, zero_point)
+
+    weight = (np.random.random(weight_shape) - 0.5).astype(type_name_to_np_type[dtype])
+    (scale, zero_point) = QuantizeJob(weight).get()
+    _check_min_max_observer(
+        test_case,
+        weight,
+        scale.numpy(),
+        zero_point.numpy(),
+        quantization_bit,
+        quantization_scheme,
+        quantization_formula,
+        per_layer_quantization,
+    )
+
+
+def gen_quant_scale_for_moving_average_min_max_symmetric(
+    activation, quantization_bit, momentum, moving_max, moving_min
+):
+    activation_max = np.max(np.abs(activation))
+    denominator = 2.0 ** (quantization_bit - 1) - 1
+    if moving_max[0] == 0:
+        moving_max[0] = activation_max
+    else:
+        moving_max[0] = moving_max[0] * momentum + activation_max * (1 - momentum)
+    moving_min[0] = moving_max[0]
+    return (moving_max[0] / denominator, 0)
+
+
+def gen_quant_scale_for_moving_average_min_max_affine(
+    activation, quantization_bit, momentum, moving_max, moving_min
+):
+    activation_max = np.max(activation)
+    activation_min = np.min(activation)
+    denominator = 2.0 ** quantization_bit - 1
+    if moving_max[0] == 0:
+        moving_max[0] = activation_max
+    else:
+        moving_max[0] = moving_max[0] * momentum + activation_max * (1 - momentum)
+    if moving_min[0] == 0:
+        moving_min[0] = activation_min
+    else:
+        moving_min[0] = moving_min[0] * momentum + activation_min * (1 - momentum)
+    scale = (moving_max[0] - moving_min[0]) / denominator
+    zero_point = -np.round(moving_min[0] / scale)
+    return (scale, zero_point)
+
+
+def gen_quant_scale_for_moving_average_min_max_cambricon(
+    activation, quantization_bit, momentum, moving_max, moving_min
+):
+    activation_max = np.max(np.abs(activation))
+    if moving_max[0] == 0:
+        moving_max[0] = activation_max
+    else:
+        moving_max[0] = moving_max[0] * momentum + activation_max * (1 - momentum)
+    moving_min[0] = moving_max[0]
+    return (math.floor(math.log2(moving_max[0])) - (quantization_bit - 2), 0)
+
+
+def _check_moving_average_min_max_observer(
+    test_case,
+    activation,
+    scale_of,
+    zero_point_of,
+    moving_max_np,
+    moving_min_np,
+    quantization_bit,
+    quantization_scheme,
+    quantization_formula,
+    momentum,
+):
+    if quantization_formula == "google":
+        if quantization_scheme == "symmetric":
+            (
+                scale_np,
+                zero_point_np,
+            ) = gen_quant_scale_for_moving_average_min_max_symmetric(
+                activation.flatten(),
+                quantization_bit,
+                momentum,
+                moving_max_np,
+                moving_min_np,
+            )
+        else:
+            (
+                scale_np,
+                zero_point_np,
+            ) = gen_quant_scale_for_moving_average_min_max_affine(
+                activation.flatten(),
+                quantization_bit,
+                momentum,
+                moving_max_np,
+                moving_min_np,
+            )
+    else:
+        (
+            scale_np,
+            zero_point_np,
+        ) = gen_quant_scale_for_moving_average_min_max_cambricon(
+            activation.flatten(),
+            quantization_bit,
+            momentum,
+            moving_max_np,
+            moving_min_np,
+        )
+    test_case.assertTrue(np.allclose(scale_of[0], scale_np, rtol=0.001))
+    test_case.assertTrue(np.allclose(zero_point_of[0], zero_point_np, rtol=0.001))
+
+
+def _run_test_moving_average_min_max_observer(
+    test_case,
+    device_type,
+    device_num,
+    dtype,
+    activation_shape,
+    quantization_bit,
+    quantization_scheme,
+    quantization_formula,
+    momentum,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        flow.config.gpu_device_num(device_num)
+
+    @flow.global_function(type="train", function_config=flow.FunctionConfig())
+    def QuantizeJob(
+        activation: oft.Numpy.Placeholder(
+            activation_shape, dtype=type_name_to_flow_type[dtype]
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0-%d" % (device_num - 1)):
+            x = flow.get_variable(
+                "x",
+                shape=activation_shape,
+                dtype=activation.dtype,
+                initializer=flow.zeros_initializer(activation.dtype),
+                trainable=True,
+            )
+            (scale, zero_point) = flow.quantization.moving_average_min_max_observer(
+                activation,
+                quantization_bit,
+                quantization_scheme,
+                quantization_formula,
+                momentum,
+            )
+            fake = x + activation
+            loss = flow.math.reduce_mean(fake)
+            flow.optimizer.Adam(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            ).minimize(loss)
+        return (scale, zero_point)
+
+    moving_max_np = np.zeros((1,))
+    moving_min_np = np.zeros((1,))
+    for i in range(10):
+        activation = (np.random.random(activation_shape) - 0.5).astype(
+            type_name_to_np_type[dtype]
+        )
+        (scale, zero_point) = QuantizeJob(activation).get()
+        _check_moving_average_min_max_observer(
+            test_case,
+            activation,
+            scale.numpy(),
+            zero_point.numpy(),
+            moving_max_np,
+            moving_min_np,
+            quantization_bit,
+            quantization_scheme,
+            quantization_formula,
+            momentum,
+        )
+
+
+def fake_quant_per_layer_symmetric(input, quantization_bit, scale):
+    upper_bound = 2.0 ** (quantization_bit - 1) - 1
+    lower_bound = -upper_bound
+    return np.clip(np.rint(input / scale), lower_bound, upper_bound) * scale
+
+
+def fake_quant_per_layer_affine(input, quantization_bit, scale, zero_point):
+    upper_bound = 2.0 ** quantization_bit - 1
+    lower_bound = 0
+    return (
+        np.clip(np.rint(input / scale + zero_point), lower_bound, upper_bound)
+        - zero_point
+    ) * scale
+
+
+def fake_quant_per_layer_cambricon(input, quantization_bit, shift):
+    upper_bound = 2.0 ** (quantization_bit - 1) - 1
+    lower_bound = -upper_bound
+    scale = 2 ** shift
+    return np.clip(np.rint(input / scale), lower_bound, upper_bound) * scale
+
+
+def _check_fake_quantize(
+    test_case,
+    input,
+    input_diff_of,
+    out_of,
+    quantization_bit,
+    quantization_scheme,
+    quantization_formula,
+    per_layer_quantization,
+):
+    if per_layer_quantization or quantization_formula == "cambricon":
+        outer_num = 1
+        inner_num = product(input.shape[0:])
+    else:
+        outer_num = input.shape[0]
+        inner_num = product(input.shape[1:])
+    scale_np = np.zeros((outer_num,))
+    zero_point_np = np.zeros((outer_num,))
+    out_np = np.zeros((inner_num * outer_num,))
+    input_flatten = input.flatten()
+    input_diff_np = np.full((inner_num * outer_num,), 1.0 / (inner_num * outer_num))
+    if quantization_formula == "google":
+        if quantization_scheme == "symmetric":
+            for c in range(outer_num):
+                (scale_np[c], zero_point_np[c]) = gen_quant_scale_for_min_max_symmetric(
+                    input_flatten[c * inner_num : (c + 1) * inner_num], quantization_bit
+                )
+                out = fake_quant_per_layer_symmetric(
+                    input_flatten[c * inner_num : (c + 1) * inner_num],
+                    quantization_bit,
+                    scale_np[c],
+                )
+                out_np[c * inner_num : (c + 1) * inner_num] = out
+        else:
+            for c in range(outer_num):
+                (scale_np[c], zero_point_np[c]) = gen_quant_scale_for_min_max_affine(
+                    input_flatten[c * inner_num : (c + 1) * inner_num], quantization_bit
+                )
+                out = fake_quant_per_layer_affine(
+                    input_flatten[c * inner_num : (c + 1) * inner_num],
+                    quantization_bit,
+                    scale_np[c],
+                    zero_point_np[c],
+                )
+                out_np[c * inner_num : (c + 1) * inner_num] = out
+    else:
+        (scale_np[0], zero_point_np[0]) = gen_quant_scale_for_min_max_cambricon(
+            input_flatten, quantization_bit
+        )
+        out_np = fake_quant_per_layer_cambricon(
+            input_flatten, quantization_bit, scale_np[0]
+        )
+    test_case.assertTrue(np.allclose(out_of, out_np, rtol=0.001))
+    test_case.assertTrue(np.allclose(input_diff_of, input_diff_np, rtol=0.001))
+
+
+def _run_test_fake_quantize(
+    test_case,
+    device_type,
+    device_num,
+    dtype,
+    in_shape,
+    quantization_bit,
+    quantization_scheme,
+    quantization_formula,
+    per_layer_quantization,
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_num)
+    else:
+        flow.config.gpu_device_num(device_num)
+
+    @flow.global_function(type="train", function_config=flow.FunctionConfig())
+    def QuantizeJob(
+        input: oft.Numpy.Placeholder(in_shape, dtype=type_name_to_flow_type[dtype])
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=in_shape,
+                dtype=input.dtype,
+                initializer=flow.zeros_initializer(input.dtype),
+                trainable=True,
+            )
+            input_x = input + x
+        flow.watch_diff(input_x, test_global_storage.Setter("input_diff"))
+        with flow.scope.placement(device_type, "0:0-%d" % (device_num - 1)):
+            (scale, zero_point) = flow.quantization.min_max_observer(
+                input_x,
+                quantization_bit,
+                quantization_scheme,
+                quantization_formula,
+                per_layer_quantization,
+            )
+            out = flow.quantization.fake_quantization(
+                input_x,
+                scale,
+                zero_point,
+                quantization_bit,
+                quantization_scheme,
+                quantization_formula,
+            )
+            loss = flow.math.reduce_mean(out)
+            flow.optimizer.Adam(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            ).minimize(loss)
+        return out
+
+    input = (np.random.random(in_shape) - 0.5).astype(type_name_to_np_type[dtype])
+    out = QuantizeJob(input).get()
+    input_diff = test_global_storage.Get("input_diff")
+    _check_fake_quantize(
+        test_case,
+        input,
+        input_diff.flatten(),
+        out.numpy().flatten(),
+        quantization_bit,
+        quantization_scheme,
+        quantization_formula,
+        per_layer_quantization,
+    )
+
+
+@unittest.skip("This test possibly fails")
+@flow.unittest.skip_unless_1n4d()
+class TestMinMaxObserver(flow.unittest.TestCase):
+    def test_min_max_observer(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["device_num"] = [1, 4]
+        arg_dict["dtype"] = ["float32", "double"]
+        arg_dict["weight_shape"] = [(9, 40, 20, 10)]
+        arg_dict["quantization_bit"] = [8, 2]
+        arg_dict["quantization_scheme"] = ["symmetric", "affine"]
+        arg_dict["quantization_formula"] = ["google"]
+        arg_dict["per_layer_quantization"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            if arg[-2] == "cambricon" and arg[-1] == False:
+                continue
+            _run_test_min_max_observer(*arg)
+
+
+@unittest.skip("This test possibly fails")
+class TestMovingAverageMinMaxObserver(flow.unittest.TestCase):
+    def test_moving_average_min_max_observer(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["device_num"] = [1, 4]
+        arg_dict["dtype"] = ["float32", "double"]
+        arg_dict["activation_shape"] = [(9, 40, 20, 10)]
+        arg_dict["quantization_bit"] = [8, 2]
+        arg_dict["quantization_scheme"] = ["symmetric", "affine"]
+        arg_dict["quantization_formula"] = ["google"]
+        arg_dict["momentum"] = [0.95]
+        for arg in GenArgList(arg_dict):
+            _run_test_moving_average_min_max_observer(*arg)
+
+
+@unittest.skip("This test possibly fails")
+@flow.unittest.skip_unless_1n4d()
+class TestFakeQuantize(flow.unittest.TestCase):
+    def test_fake_quantize(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_case"] = [test_case]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["device_num"] = [1, 4]
+        arg_dict["dtype"] = ["float32", "double"]
+        arg_dict["in_shape"] = [(9, 40, 20, 10)]
+        arg_dict["quantization_bit"] = [8, 2]
+        arg_dict["quantization_scheme"] = ["symmetric", "affine"]
+        arg_dict["quantization_formula"] = ["google"]
+        arg_dict["per_layer_quantization"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            if arg[-2] == "cambricon" and arg[-1] == False:
+                continue
+            _run_test_fake_quantize(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_random_mask_like.py b/python/oneflow/compatible/single_client/test/ops/test_random_mask_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bbe67c77df999fb4a9175f02fa704922e5054ee
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_random_mask_like.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def of_run(device_type, x_shape, rate, seed):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+
+    @flow.global_function(function_config=func_config)
+    def RandomMaskLikeJob(x: oft.Numpy.Placeholder(x_shape)):
+        with flow.scope.placement(device_type, "0:0"):
+            mask = flow.nn.random_mask_like(x, rate=rate, seed=seed, name="random_mask")
+            return mask
+
+    x = np.random.rand(*x_shape).astype(np.float32)
+    of_out = RandomMaskLikeJob(x).get().numpy()
+    assert np.allclose(
+        [1 - np.count_nonzero(of_out) / of_out.size], [rate], atol=rate / 5
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestRandomMaskLike(flow.unittest.TestCase):
+    def test_random_mask_like(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["x_shape"] = [(100, 100, 10, 20), (100, 100, 200)]
+        arg_dict["rate"] = [0.1, 0.4, 0.75]
+        arg_dict["seed"] = [12345, None]
+        for arg in GenArgList(arg_dict):
+            of_run(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_range.py b/python/oneflow/compatible/single_client/test/ops/test_range.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7e6a1d744ff195eddedbffc599e560202a2c21a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_range.py
@@ -0,0 +1,150 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import List
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def compare_range_with_np_CPU(device_type, machine_ids, device_counts):
+    assert device_type in ["cpu"]
+    flow.clear_default_session()
+    flow.env.init()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+
+    @flow.global_function(function_config=func_config, type="train")
+    def oneflow_range() -> List[tp.Numpy]:
+        with flow.scope.placement(device_type, machine_ids):
+            out_1 = flow.range(1, 10, 3, dtype=flow.float64, name="range_float64")
+            out_2 = flow.range(3, 6, 1, dtype=flow.float32, name="range_float32")
+            out_3 = flow.range(3, dtype=flow.int32, name="range_int32")
+            out_4 = flow.range(0, 6, 2, dtype=flow.int64, name="range_int64")
+            x_var = flow.get_variable(
+                "cpu_input",
+                shape=(3,),
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            x_out = out_2 + x_var
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(x_out)
+        return [out_1, out_2, out_3, out_4]
+
+    def np_range():
+        np_out_1 = np.arange(1, 10, 3).astype(np.float64)
+        np_out_2 = np.arange(3, 6, 1).astype(np.float32)
+        np_out_3 = np.arange(3).astype(np.int32)
+        np_out_4 = np.arange(0, 6, 2).astype(np.int64)
+        return [np_out_1, np_out_2, np_out_3, np_out_4]
+
+    of_out_list = oneflow_range()
+    np_out_list = np_range()
+    for i in range(len(of_out_list)):
+        assert np.array_equal(of_out_list[i], np_out_list[i])
+
+
+def compare_range_with_np_GPU(device_type, machine_ids, device_counts):
+    assert device_type in ["gpu"]
+    flow.clear_default_session()
+    flow.env.init()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+
+    @flow.global_function(function_config=func_config, type="train")
+    def oneflow_range_gpu() -> List[tp.Numpy]:
+        with flow.scope.placement(device_type, machine_ids):
+            out_1 = flow.range(1, 10, 3, dtype=flow.float64, name="range_float64")
+            out_2 = flow.range(3, 6, 1, dtype=flow.float32, name="range_float32")
+            out_3 = flow.range(4, 13, 4, dtype=flow.float32, name="range_float16")
+            out_3 = flow.cast(out_3, dtype=flow.float32)
+            out_4 = flow.range(3, dtype=flow.int32, name="range_int32")
+            out_5 = flow.range(0, 6, 2, dtype=flow.int64, name="range_int64")
+            x_var = flow.get_variable(
+                "gpu_input",
+                shape=(3,),
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0.0),
+            )
+            x_gpu_out = x_var + out_2
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(x_gpu_out)
+        return [out_1, out_2, out_3, out_4, out_5]
+
+    def np_range_gpu():
+        np_out_1 = np.arange(1, 10, 3).astype(np.float64)
+        np_out_2 = np.arange(3, 6, 1).astype(np.float32)
+        np_out_3 = np.arange(4, 13, 4).astype(np.float16)
+        np_out_4 = np.arange(3).astype(np.int32)
+        np_out_5 = np.arange(0, 6, 2).astype(np.int64)
+        return [np_out_1, np_out_2, np_out_3, np_out_4, np_out_5]
+
+    of_out_list = oneflow_range_gpu()
+    np_out_list = np_range_gpu()
+    for i in range(len(of_out_list)):
+        assert np.array_equal(of_out_list[i], np_out_list[i])
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testrange1n1d(flow.unittest.TestCase):
+    def test_range_cpu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu"]
+        arg_dict["machine_ids"] = ["0:0"]
+        arg_dict["device_counts"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_range_with_np_CPU(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_range_gpu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["machine_ids"] = ["0:0"]
+        arg_dict["device_counts"] = [1]
+        for arg in GenArgList(arg_dict):
+            compare_range_with_np_GPU(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testrange1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_range_gpu_1n2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["machine_ids"] = ["0:0-1"]
+        arg_dict["device_counts"] = [2]
+        for arg in GenArgList(arg_dict):
+            compare_range_with_np_GPU(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reduce_mean.py b/python/oneflow/compatible/single_client/test/ops/test_reduce_mean.py
new file mode 100644
index 0000000000000000000000000000000000000000..04b9a66e4352f98b9ae9d1ed3715a89a16420353
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reduce_mean.py
@@ -0,0 +1,86 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, input_shape, axis, keepdims):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ReduceMeanJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = flow.math.reduce_mean(x, axis=axis, keepdims=keepdims)
+            loss = flow.identity(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ReduceMeanJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.math.reduce_mean(x, axis=axis, keepdims=keepdims)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReduceMean(flow.unittest.TestCase):
+    def test_reduce_mean(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reduce_ops.py b/python/oneflow/compatible/single_client/test/ops/test_reduce_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0bd0073d5ca337a67ff2d18c6f468f6581e6cf6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reduce_ops.py
@@ -0,0 +1,844 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow._oneflow_internal
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_reduce_any_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.int8)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceAnyJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.int8)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_any(x, axis=axis, keepdims=keepdims)
+
+    x = np.random.rand(*input_shape).astype(np.int8)
+    of_out = ReduceAnyJob(x).get()
+    tf_out = tf.math.reduce_any(x, axis=axis, keepdims=keepdims)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+
+
+def compare_reduce_prod_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceProdJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_prod(x, axis=axis, keepdims=keepdims)
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    of_out = ReduceProdJob(x).get()
+    tf_out = tf.math.reduce_prod(x, axis=axis, keepdims=keepdims)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+
+
+def compare_reduce_min_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ReduceMinJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="v1",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            loss = flow.math.reduce_min(x, axis=axis, keepdims=keepdims)
+            loss = flow.identity(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    of_out = ReduceMinJob(x).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(x)
+        tf_out = tf.math.reduce_min(x, axis=axis, keepdims=keepdims)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+def compare_reduce_all_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.int8)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceAllJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.int8)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_all(x, axis=axis, keepdims=keepdims)
+
+    x = np.random.rand(*input_shape).astype(np.int8)
+    of_out = ReduceAllJob(x).get()
+    tf_out = tf.math.reduce_all(x, axis=axis, keepdims=keepdims)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+
+
+def compare_reduce_sum_with_tensorflow(
+    test_case, device_type, input_shape, axis, keepdims
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.int32)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceSumJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.int32)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+
+    x = (np.random.rand(*input_shape) * 100).astype(np.int32)
+    of_out = ReduceSumJob(x).get()
+    tf_out = tf.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+    test_case.assertTrue(np.allclose(of_out.numpy(), tf_out.numpy()))
+
+
+def compare_reduce_euclidean_norm_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=0.001, atol=0.001
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceEuclideanNormJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_euclidean_norm(x, axis=axis, keepdims=keepdims)
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    of_out = ReduceEuclideanNormJob(x).get()
+    tf_out = tf.math.reduce_euclidean_norm(x, axis=axis, keepdims=keepdims)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+
+
+def compare_reduce_logsumexp_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceLogSumExpJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_logsumexp(x, axis=axis, keepdims=keepdims)
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    of_out = ReduceLogSumExpJob(x).get()
+    tf_out = tf.math.reduce_logsumexp(x, axis=axis, keepdims=keepdims)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+
+
+def compare_reduce_std_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceStdJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_std(x, axis=axis, keepdims=keepdims)
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    of_out = ReduceStdJob(x).get()
+    tf_out = tf.math.reduce_std(x, axis=axis, keepdims=keepdims)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+
+
+def compare_reduce_variance_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceVarianceJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.reduce_variance(x, axis=axis, keepdims=keepdims)
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    of_out = ReduceVarianceJob(x).get()
+    tf_out = tf.math.reduce_variance(x, axis=axis, keepdims=keepdims)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+
+
+def compare_reduce_max_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ReduceMaxJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="v1",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            loss = flow.math.reduce_max(x, axis=axis, keepdims=keepdims)
+            loss = flow.identity(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    x = np.random.rand(*input_shape).astype(np.float32)
+    of_out = ReduceMaxJob(x).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(x)
+        tf_out = tf.math.reduce_max(x, axis=axis, keepdims=keepdims)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestReduceOps(flow.unittest.TestCase):
+    def test_reduce_any_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_any_with_tensorflow(*arg)
+
+    def test_reduce_any_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_any_with_tensorflow(*arg)
+
+    def test_reduce_any_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_any_with_tensorflow(*arg)
+
+    def test_reduce_any_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_any_with_tensorflow(*arg)
+
+    def test_reduce_any_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_any_with_tensorflow(*arg)
+
+    def test_reduce_any_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,), dtype=flow.int8)):
+            y = flow.math.reduce_any(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.int8))
+
+    def test_reduce_prod_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_prod_with_tensorflow(*arg)
+
+    def test_reduce_prod_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_prod_with_tensorflow(*arg)
+
+    def test_reduce_prod_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_prod_with_tensorflow(*arg)
+
+    def test_reduce_prod_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_prod_with_tensorflow(*arg)
+
+    def test_reduce_prod_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_prod_with_tensorflow(*arg)
+
+    def test_reduce_prod_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_prod(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+    def test_reduce_min_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_min_with_tensorflow(*arg)
+
+    def test_reduce_min_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_min_with_tensorflow(*arg)
+
+    def test_reduce_min_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_min_with_tensorflow(*arg)
+
+    def test_reduce_min_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_min_with_tensorflow(*arg)
+
+    def test_reduce_min_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_min_with_tensorflow(*arg)
+
+    def test_reduce_min_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_min(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+    def test_reduce_all_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_all_with_tensorflow(*arg)
+
+    def test_reduce_all_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_all_with_tensorflow(*arg)
+
+    def test_reduce_all_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_all_with_tensorflow(*arg)
+
+    def test_reduce_all_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_all_with_tensorflow(*arg)
+
+    def test_reduce_all_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_all_with_tensorflow(*arg)
+
+    def test_reduce_all_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,), dtype=flow.int8)):
+            y = flow.math.reduce_all(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.int8))
+
+    def test_reduce_sum_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(test_case, *arg)
+
+    def test_reduce_sum_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(test_case, *arg)
+
+    def test_reduce_sum_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(test_case, *arg)
+
+    def test_reduce_sum_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(test_case, *arg)
+
+    def test_reduce_sum_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(test_case, *arg)
+
+    def test_reduce_sum_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_sum(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+    def test_reduce_euclidean_norm_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_euclidean_norm_with_tensorflow(*arg)
+
+    def test_reduce_euclidean_norm_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_euclidean_norm_with_tensorflow(*arg)
+
+    def test_reduce_euclidean_norm_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_euclidean_norm_with_tensorflow(*arg)
+
+    def test_reduce_euclidean_norm_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_euclidean_norm_with_tensorflow(*arg)
+
+    def test_reduce_euclidean_norm_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_euclidean_norm_with_tensorflow(*arg)
+
+    def test_reduce_euclidean_norm_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_euclidean_norm(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+    def test_reduce_logsumexp_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_logsumexp_with_tensorflow(*arg)
+
+    def test_reduce_logsumexp_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_logsumexp_with_tensorflow(*arg)
+
+    def test_reduce_logsumexp_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_logsumexp_with_tensorflow(*arg)
+
+    def test_reduce_logsumexp_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_logsumexp_with_tensorflow(*arg)
+
+    def test_reduce_logsumexp_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_logsumexp_with_tensorflow(*arg)
+
+    def test_reduce_logsumexp_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_logsumexp(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+    def test_reduce_std_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_std_with_tensorflow(*arg)
+
+    def test_reduce_std_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_std_with_tensorflow(*arg)
+
+    def test_reduce_std_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_std_with_tensorflow(*arg)
+
+    def test_reduce_std_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_std_with_tensorflow(*arg)
+
+    def test_reduce_std_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_std_with_tensorflow(*arg)
+
+    def test_reduce_std_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_std(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+    def test_reduce_variance_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_variance_with_tensorflow(*arg)
+
+    def test_reduce_variance_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_variance_with_tensorflow(*arg)
+
+    def test_reduce_variance_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_variance_with_tensorflow(*arg)
+
+    def test_reduce_variance_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_variance_with_tensorflow(*arg)
+
+    def test_reduce_variance_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_variance_with_tensorflow(*arg)
+
+    def test_reduce_variance_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_variance(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+    def test_reduce_max_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_max_with_tensorflow(*arg)
+
+    def test_reduce_max_with_one_value_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1,)]
+        arg_dict["axis"] = [None, [], [0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_max_with_tensorflow(*arg)
+
+    def test_reduce_max_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_max_with_tensorflow(*arg)
+
+    def test_reduce_max_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_max_with_tensorflow(*arg)
+
+    def test_reduce_max_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_max_with_tensorflow(*arg)
+
+    def test_reduce_max_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_max(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reduce_opsV2.py b/python/oneflow/compatible/single_client/test/ops/test_reduce_opsV2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2268297610cdfe6f032c51313ab67dd7e4b5c959
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reduce_opsV2.py
@@ -0,0 +1,128 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow._oneflow_internal
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_reduce_sum_with_tensorflow(
+    device_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ReduceSumJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "in",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
+                trainable=True,
+            )
+            loss = flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ReduceSumJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=0.001, atol=0.001)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestReduceOpsV2(flow.unittest.TestCase):
+    def test_reduce_sum_func(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(64, 64, 64)]
+        arg_dict["axis"] = [None, [], [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(*arg)
+
+    def test_reduce_sum_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(*arg)
+
+    def test_reduce_sum_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(25, 1024 * 1024)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(*arg)
+
+    def test_reduce_sum_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(1024 * 64, 25)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_reduce_sum_with_tensorflow(*arg)
+
+    def test_reduce_sum_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_sum(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reduce_sum.py b/python/oneflow/compatible/single_client/test/ops/test_reduce_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..5739aa6e9c4d5adbceb9f997b7753be8e1dd29f6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reduce_sum.py
@@ -0,0 +1,125 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow._oneflow_internal
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type, data_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceSumJob(x: oft.Numpy.Placeholder(input_shape)):
+        with flow.scope.placement(device_type, "0:0"):
+            if data_type == "float16":
+                y = flow.cast(
+                    flow.math.reduce_sum(
+                        flow.cast(x, dtype=flow.float16), axis=axis, keepdims=keepdims
+                    ),
+                    dtype=flow.float32,
+                )
+            else:
+                y = flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+            return y
+
+    x = np.random.rand(*input_shape).astype(np.float16).astype(np.float32)
+    of_out = ReduceSumJob(x).get()
+    tf_out = tf.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+    if data_type == "float16":
+        tf_out = tf.cast(tf_out, dtype=tf.float16)
+        tf_out = tf.cast(tf_out, dtype=tf.float32)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol), (
+        of_out.numpy(),
+        tf_out.numpy(),
+    )
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestReduceSum(flow.unittest.TestCase):
+    def test_reduce_sum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["input_shape"] = [(2, 4, 8)]
+        arg_dict["axis"] = [None, [1], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["input_shape"] = [(32, 2)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg, atol=0.1)
+
+    def test_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["input_shape"] = [(2, 64)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["input_shape"] = [(64, 2)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_sum(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reduce_sum_like.py b/python/oneflow/compatible/single_client/test/ops/test_reduce_sum_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd3c720bb5812b5d6f976919f5461ccbe6d3cc7d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reduce_sum_like.py
@@ -0,0 +1,138 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow._oneflow_internal
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def reduce_sum_like(x, like, axis):
+    name = "reduce_sum_like_op"
+    op = (
+        flow.user_op_builder(name)
+        .Op("reduce_sum_like")
+        .Input("x", [x])
+        .Input("like", [like])
+        .Output("y")
+        .Attr("axis", axis)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def compare_with_tensorflow(
+    device_type, data_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def ReduceSumLikeJob(x: oft.Numpy.Placeholder(input_shape)):
+        with flow.scope.placement(device_type, "0:0"):
+            if data_type == "float16":
+                x = flow.cast(x, dtype=flow.float16)
+                like = flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+                y = reduce_sum_like(x, like, axis=axis)
+                y = flow.cast(y, dtype=flow.float32)
+            else:
+                like = flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+                y = reduce_sum_like(x, like, axis=axis)
+            return y
+
+    x = np.random.rand(*input_shape).astype(np.float16).astype(np.float32)
+    of_out = ReduceSumLikeJob(x).get()
+    tf_out = tf.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+    if data_type == "float16":
+        tf_out = tf.cast(tf_out, dtype=tf.float16)
+        tf_out = tf.cast(tf_out, dtype=tf.float32)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol), (
+        of_out.numpy(),
+        tf_out.numpy(),
+    )
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestReduceSum(flow.unittest.TestCase):
+    def test_reduce_sum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float16", "float32", "float16"]
+        arg_dict["input_shape"] = [(2, 4, 8)]
+        arg_dict["axis"] = [[], [0, 2]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_col_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["input_shape"] = [(32, 2)]
+        arg_dict["axis"] = [[0]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_row_reduce(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["input_shape"] = [(2, 64)]
+        arg_dict["axis"] = [[1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_scalar(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["input_shape"] = [(64, 2)]
+        arg_dict["axis"] = [[0, 1]]
+        arg_dict["keepdims"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_split_axis_reduced(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Foo(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.reduce_sum(x)
+            test_case.assertTrue(y.split_axis == flow.INVALID_SPLIT_AXIS)
+
+        Foo(np.ndarray((10,), dtype=np.float32))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reflection_pad2d.py b/python/oneflow/compatible/single_client/test/ops/test_reflection_pad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b189ba2cccacdd1242aa1f6af004bebe3b8ca7f
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reflection_pad2d.py
@@ -0,0 +1,270 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    Args,
+    Array2Numpy,
+    Coordinate2Index,
+    FlattenArray,
+    GenArgDict,
+    GenArgList,
+    Index2Coordinate,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _make_op_function(
+    test_case, input, padding, grad, device_type, value_type, machine_ids, device_counts
+):
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    if value_type == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type)
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def _compare_diff(blob: tp.Numpy):
+        test_case.assertTrue(np.allclose(grad, blob, 0.001, 0.001))
+
+    if value_type == flow.float32 or value_type == flow.float64:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=value_type)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=value_type,
+                    initializer=flow.zeros_initializer(),
+                )
+                out = flow.reflection_pad2d(x, padding)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(out)
+            flow.watch_diff(x, _compare_diff)
+            return out
+
+        return op_function
+    elif value_type == flow.int32:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                )
+                y_int32 = flow.reflection_pad2d(x, padding)
+                y_fp32 = flow.cast(y_int32, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x, _compare_diff)
+            return y_fp32
+
+        return op_function
+    elif value_type == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                input_x = x_var + x
+                x_fp32 = flow.cast(input_x, flow.float32)
+                x_fp16 = flow.cast(input_x, dtype=flow.float16)
+                y_fp16 = flow.reflection_pad2d(x_fp16, padding)
+                y_fp32 = flow.cast(y_fp16, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x_fp32, _compare_diff)
+            return y_fp32
+
+        return op_function
+
+
+def gen_numpy_test_sample(input_shape, padding, is_float=True):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    pad_shape = ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right))
+
+    def _np_reflection_pad2d(input, pad_shape):
+        numpy_reflect = np.pad(input, pad_shape, "reflect")
+        return numpy_reflect
+
+    def _np_reflection_pad2d_grad(src, dest):
+        (dx_height, dx_width) = (input.shape[h_idx], input.shape[w_idx])
+        (dy_height, dy_width) = (output.shape[h_idx], output.shape[w_idx])
+        numpy_src = np.ones(src.shape, np.int32)
+        numpy_dest = np.zeros(dest.shape, np.int32)
+        array_src = FlattenArray(numpy_src)
+        array_dest = FlattenArray(numpy_dest)
+        src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+        dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+        elements_num = src.shape[0] * src_num
+        for iter_n in range(elements_num):
+            coords = Index2Coordinate(iter_n, src.shape)
+            (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+            ip_x = ip_y = 0
+            if j < pad_left:
+                ip_x = pad_left * 2 - j
+            elif j >= pad_left and j < dx_width + pad_left:
+                ip_x = j
+            else:
+                ip_x = (dx_width + pad_left - 1) * 2 - j
+            if i < pad_top:
+                ip_y = pad_top * 2 - i
+            elif i >= pad_top and i < dx_height + pad_top:
+                ip_y = i
+            else:
+                ip_y = (dx_height + pad_top - 1) * 2 - i
+            ip_x = ip_x - pad_left
+            ip_y = ip_y - pad_top
+            src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+            dest_index = (
+                n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+            )
+            array_dest[dest_index] += array_src[src_index]
+        numpy_dest = Array2Numpy(array_dest, dest.shape)
+        return numpy_dest
+
+    if is_float:
+        input = np.random.random(input_shape).astype(np.float32)
+    else:
+        input = np.random.randint(0, 100, input_shape)
+    output = _np_reflection_pad2d(input, pad_shape)
+    grad = _np_reflection_pad2d_grad(output, input)
+    numpy_results = {"input": input, "padding": padding, "output": output, "grad": grad}
+    return numpy_results
+
+
+def _compare_op_function_with_samples(
+    test_case, device_type, sample, value_type, machine_ids, device_count
+):
+    op_function = _make_op_function(
+        test_case,
+        sample["input"].astype(value_type[0]),
+        sample["padding"],
+        sample["grad"].astype(value_type[0]),
+        device_type,
+        value_type[1],
+        machine_ids,
+        device_count,
+    )
+    y = (
+        op_function(sample["input"].astype(value_type[0]))
+        .get()
+        .numpy()
+        .astype(value_type[0])
+    )
+    if value_type == flow.float16:
+        test_case.assertTrue(
+            np.allclose(y, sample["output"].astype(np.float32), 0.001, 0.001)
+        )
+    else:
+        test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0])))
+
+
+def _gen_arg_dict(
+    device_type="gpu", value_type="float", machine_ids="0:0", device_count=1
+):
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = [device_type]
+    arg_dict["samples"] = []
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 1, 2, 2), [1, 1, 1, 1]))
+    arg_dict["samples"].append(gen_numpy_test_sample((4, 2, 3, 3), [2, 2, 2, 2]))
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 3, 4, 5), [3, 2, 1, 2]))
+    if value_type == "float":
+        if device_type == "gpu":
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+        else:
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+    elif value_type == "int":
+        arg_dict["value_type"] = [(np.float32, flow.int32)]
+    else:
+        raise "float or int for value type only"
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_count"] = [device_count]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReflectionPad2d1n1d(flow.unittest.TestCase):
+    def test_op_function_int_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    def test_op_function_float_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestReflectionPad2d1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_relu6.py b/python/oneflow/compatible/single_client/test/ops/test_relu6.py
new file mode 100644
index 0000000000000000000000000000000000000000..a26a479e3ce1fe56837417c46835aa76d0a88c88
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_relu6.py
@@ -0,0 +1,192 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_relu6_with_np(
+    input_shape, device_type, value_type, machine_ids, device_counts
+):
+    if value_type[1] == flow.float16:
+        input_1 = np.random.uniform(-1, 7, size=input_shape).astype(np.float16)
+        input_1 = np.array(input_1, dtype=value_type[0])
+    else:
+        input_1 = np.random.uniform(-1, 7, size=input_shape).astype(value_type[0])
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    if value_type == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type[1])
+
+    def np_relu6(input):
+        out = np.clip(input, 0.0, 6.0)
+        return np.array(out).astype(value_type[0])
+
+    np_out_relu6 = np_relu6(input_1)
+
+    def np_diff(input):
+        input_shape = input.shape
+        input = input.flatten()
+        elem_cnt = input.size
+        diff = np.zeros(shape=(elem_cnt,))
+        for i in range(elem_cnt):
+            if input[i] > 0 and input[i] < 6:
+                diff[i] = 1
+        diff = np.reshape(diff, newshape=input_shape)
+        diff = np.array(diff, dtype=value_type[0])
+        return diff
+
+    _np_grad = np_diff(input_1)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        if value_type[1] == flow.float16:
+            assert np.allclose(blob, _np_grad, atol=0.001)
+        else:
+            assert np.allclose(blob, _np_grad, atol=1e-05)
+
+    if value_type[1] == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_relu6(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=flow.float32)
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+                x_f16 = flow.cast(x_var, flow.float16)
+            of_relu6_out_f16 = flow.nn.relu6(x_f16)
+            of_relu6_out_f32 = flow.cast(of_relu6_out_f16, flow.float32)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_relu6_out_f32)
+            flow.watch_diff(x_var, assert_prediction_grad)
+            return of_relu6_out_f32
+
+    elif value_type[1] == flow.float32 or value_type[1] == flow.float64:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def oneflow_relu6(
+            of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=value_type[1])
+        ) -> tp.Numpy:
+            with flow.scope.placement(device_type, "0:0"):
+                v = flow.get_variable(
+                    shape=input_1.shape,
+                    dtype=value_type[1],
+                    initializer=flow.zeros_initializer(),
+                    name="x_var",
+                )
+                x_var = of_input_1 + v
+            flow.watch_diff(x_var, assert_prediction_grad)
+            of_relu6_out = flow.nn.relu6(x_var)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(of_relu6_out)
+            return of_relu6_out
+
+    of_out_relu6 = oneflow_relu6(input_1)
+    if value_type[1] == flow.float16:
+        assert np.allclose(of_out_relu6, np_out_relu6, atol=0.001)
+    else:
+        assert np.allclose(of_out_relu6, np_out_relu6, atol=1e-05)
+
+
+def _gen_arg_dict(shape, device_type, value_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    if value_type == "float" and device_type == "cpu":
+        arg_dict["value_type"] = [
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    else:
+        arg_dict["value_type"] = [
+            (np.float32, flow.float16),
+            (np.float32, flow.float32),
+            (np.float64, flow.float64),
+        ]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testrelu61n1d(flow.unittest.TestCase):
+    def test_relu6_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(16, 16),
+            device_type="cpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_relu6_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_relu6_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 16),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_relu6_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testrelu61n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_relu6_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 16),
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_relu6_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_repeat_acc.py b/python/oneflow/compatible/single_client/test/ops/test_repeat_acc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5de7078bd6d3b047f0539ce39a1046840bd0716
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_repeat_acc.py
@@ -0,0 +1,71 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import Args, GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_logical_view(flow.scope.mirrored_view())
+func_config.default_data_type(flow.float)
+
+
+def test_repeat_acc(test_case, device_type, shape, dtype, acc_num):
+    flow.clear_default_session()
+    if flow.eager_execution_enabled():
+        return
+
+    @flow.global_function(function_config=func_config)
+    def RepeatAccJob(a: oft.Numpy.Placeholder(shape)):
+        if dtype == "float16":
+            return flow.cast(
+                flow.acc(flow.repeat(flow.cast(a, flow.float16), acc_num), acc_num),
+                flow.float,
+            )
+        else:
+            return flow.acc(flow.repeat(a, acc_num), acc_num)
+
+    x = np.random.rand(*shape).astype(np.float32)
+    y = RepeatAccJob(x).get().numpy()
+    z = x * acc_num
+    if dtype == "float16":
+        z = x.astype(np.float16) * acc_num
+        z = z.astype(np.float32)
+    test_case.assertTrue(np.allclose(y, z, rtol=1e-05, atol=1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestRepeatAcc(flow.unittest.TestCase):
+    def test_case(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["shape"] = [(1024, 1024, 4)]
+        arg_dict["dtype"] = ["float16", "float32", "double"]
+        arg_dict["acc_num"] = [5]
+        for arg in GenArgDict(arg_dict):
+            if arg["device_type"] == "cpu" and arg["dtype"] == "float16":
+                continue
+            test_repeat_acc(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_replication_pad2d.py b/python/oneflow/compatible/single_client/test/ops/test_replication_pad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d74765fb12871030061d2b8a6e6dbfe944d58ff1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_replication_pad2d.py
@@ -0,0 +1,270 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    Args,
+    Array2Numpy,
+    Coordinate2Index,
+    FlattenArray,
+    GenArgDict,
+    GenArgList,
+    Index2Coordinate,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _make_op_function(
+    test_case, input, padding, grad, device_type, value_type, machine_ids, device_counts
+):
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    if value_type == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type)
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def _compare_diff(blob: tp.Numpy):
+        test_case.assertTrue(np.allclose(grad, blob, 0.001, 0.001))
+
+    if value_type == flow.float32 or value_type == flow.float64:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=value_type)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=value_type,
+                    initializer=flow.zeros_initializer(),
+                )
+                out = flow.replication_pad2d(x, padding)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(out)
+            flow.watch_diff(x, _compare_diff)
+            return out
+
+        return op_function
+    elif value_type == flow.int32:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                )
+                y_int32 = flow.replication_pad2d(x, padding)
+                y_fp32 = flow.cast(y_int32, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x, _compare_diff)
+            return y_fp32
+
+        return op_function
+    elif value_type == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                input_x = x_var + x
+                x_fp32 = flow.cast(input_x, flow.float32)
+                x_fp16 = flow.cast(input_x, dtype=flow.float16)
+                y_fp16 = flow.replication_pad2d(x_fp16, padding)
+                y_fp32 = flow.cast(y_fp16, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x_fp32, _compare_diff)
+            return y_fp32
+
+        return op_function
+
+
+def gen_numpy_test_sample(input_shape, padding, is_float=True):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    pad_shape = ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right))
+
+    def _np_replication_pad2d(input, pad_shape):
+        numpy_replicate = np.pad(input, pad_shape, "edge")
+        return numpy_replicate
+
+    def _np_replication_pad2d_grad(src, dest):
+        (dx_height, dx_width) = (input.shape[h_idx], input.shape[w_idx])
+        (dy_height, dy_width) = (output.shape[h_idx], output.shape[w_idx])
+        numpy_src = np.ones(src.shape, np.int32)
+        numpy_dest = np.zeros(dest.shape, np.int32)
+        array_src = FlattenArray(numpy_src)
+        array_dest = FlattenArray(numpy_dest)
+        src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+        dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+        elements_num = src.shape[0] * src_num
+        for iter_n in range(elements_num):
+            coords = Index2Coordinate(iter_n, src.shape)
+            (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+            ip_x = ip_y = 0
+            if j < pad_left:
+                ip_x = pad_left
+            elif j >= pad_left and j < dx_width + pad_left:
+                ip_x = j
+            else:
+                ip_x = dx_width + pad_left - 1
+            if i < pad_top:
+                ip_y = pad_top
+            elif i >= pad_top and i < dx_height + pad_top:
+                ip_y = i
+            else:
+                ip_y = dx_height + pad_top - 1
+            ip_x = ip_x - pad_left
+            ip_y = ip_y - pad_top
+            src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+            dest_index = (
+                n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+            )
+            array_dest[dest_index] += array_src[src_index]
+        numpy_dest = Array2Numpy(array_dest, dest.shape)
+        return numpy_dest
+
+    if is_float:
+        input = np.random.random(input_shape).astype(np.float32)
+    else:
+        input = np.random.randint(0, 100, input_shape)
+    output = _np_replication_pad2d(input, pad_shape)
+    grad = _np_replication_pad2d_grad(output, input)
+    numpy_results = {"input": input, "padding": padding, "output": output, "grad": grad}
+    return numpy_results
+
+
+def _compare_op_function_with_samples(
+    test_case, device_type, sample, value_type, machine_ids, device_count
+):
+    op_function = _make_op_function(
+        test_case,
+        sample["input"].astype(value_type[0]),
+        sample["padding"],
+        sample["grad"].astype(value_type[0]),
+        device_type,
+        value_type[1],
+        machine_ids,
+        device_count,
+    )
+    y = (
+        op_function(sample["input"].astype(value_type[0]))
+        .get()
+        .numpy()
+        .astype(value_type[0])
+    )
+    if value_type == flow.float16:
+        test_case.assertTrue(
+            np.allclose(y, sample["output"].astype(np.float32), 0.001, 0.001)
+        )
+    else:
+        test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0])))
+
+
+def _gen_arg_dict(
+    device_type="gpu", value_type="float", machine_ids="0:0", device_count=1
+):
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = [device_type]
+    arg_dict["samples"] = []
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 1, 2, 2), [1, 1, 1, 1]))
+    arg_dict["samples"].append(gen_numpy_test_sample((4, 2, 3, 3), [2, 2, 2, 2]))
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 3, 4, 5), [3, 2, 1, 2]))
+    if value_type == "float":
+        if device_type == "gpu":
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+        else:
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+    elif value_type == "int":
+        arg_dict["value_type"] = [(np.float32, flow.int32)]
+    else:
+        raise "float or int for value type only"
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_count"] = [device_count]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReplicationPad2d1n1d(flow.unittest.TestCase):
+    def test_op_function_int_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    def test_op_function_float_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestReplicationPad2d1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reshape.py b/python/oneflow/compatible/single_client/test/ops/test_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..aea3d4b4670dd244ab86167337df2db3afc84e90
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reshape.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import Args, CompareOpWithTensorFlow, GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReshape(flow.unittest.TestCase):
+    def test_reshape(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["flow_op"] = [flow.reshape]
+        arg_dict["tf_op"] = [tf.reshape]
+        arg_dict["input_shape"] = [(10, 10, 10)]
+        arg_dict["op_args"] = [
+            Args([(100, 10)]),
+            Args([(10, 100)]),
+            Args([(5, 20, 10)]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareOpWithTensorFlow(**arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reshapeV2.py b/python/oneflow/compatible/single_client/test/ops/test_reshapeV2.py
new file mode 100644
index 0000000000000000000000000000000000000000..43a98e205556e466f1f46ecfcd7417cab4d4ccbc
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reshapeV2.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, input_shape, shape):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ReshapeJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "in",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
+                trainable=True,
+            )
+            loss = flow.reshape(x, shape)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ReshapeJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.reshape(x, shape)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReshapeV2(flow.unittest.TestCase):
+    def test_reshape(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(5, 4, 3), (2, 3, 4, 5)]
+        arg_dict["shape"] = [[2, -1], [-1], [3, -1]]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reshapeV3.py b/python/oneflow/compatible/single_client/test/ops/test_reshapeV3.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc20f0ac6e2bbb00245e851ca2c842b66f39ac6b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reshapeV3.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def distribute_reshape_test(device_type, device_num, input_shape, shape):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    flow.config.gpu_device_num(device_num)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ReshapeJob():
+        with flow.scope.placement(device_type, "0:0-{}".format(device_num - 1)):
+            x = flow.get_variable(
+                "var_x",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
+                trainable=True,
+                distribute=flow.distribute.split(2),
+            )
+            loss = flow.reshape(x, shape)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            return (x, loss)
+
+    (x, loss) = ReshapeJob().get()
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestReshapeV2(flow.unittest.TestCase):
+    def test_reshape(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["device_num"] = [2]
+        arg_dict["input_shape"] = [(5, 8, 16)]
+        arg_dict["shape"] = [[-1, 16]]
+        for arg in GenArgList(arg_dict):
+            distribute_reshape_test(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_reverse.py b/python/oneflow/compatible/single_client/test/ops/test_reverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..23fea320bcb170d3155a4fe12f81162f4e80d274
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_reverse.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _of_reverse(input, axis, dtype):
+    flow.clear_default_session()
+
+    @flow.global_function()
+    def reverse(
+        input: tp.Numpy.Placeholder(shape=input.shape, dtype=dtype)
+    ) -> tp.Numpy:
+        return flow.reverse(input, axis)
+
+    return reverse(input)
+
+
+def _test_reverse(test_case, input, axis, dtype, verbose=False):
+    assert isinstance(input, np.ndarray)
+    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    slice_list = [slice(None)] * input.ndim
+    for a in axis:
+        if a < 0:
+            a += input.ndim
+        assert a >= 0 and a < input.ndim
+        slice_list[a] = slice(None, None, -1)
+    output = input[tuple(slice_list)]
+    of_output = _of_reverse(input, axis, dtype)
+    if verbose:
+        print("input: {}\n{}\n".format(input.shape, input))
+        print("comparing output:\n{}\nvs.\n{}".format(output, of_output))
+    test_case.assertTrue(np.array_equal(output, of_output))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReverse(flow.unittest.TestCase):
+    def test_reverse_case_1(test_case):
+        input = np.arange(1 * 2 * 3 * 4).reshape(1, 2, 3, 4)
+        _test_reverse(test_case, input, [3], flow.int32)
+
+    def test_reverse_case_2(test_case):
+        input = np.arange(1 * 2 * 3 * 4).reshape(1, 2, 3, 4)
+        _test_reverse(test_case, input, [-1], flow.int32)
+
+    def test_reverse_case_3(test_case):
+        input = np.arange(1 * 2 * 3 * 4).reshape(1, 2, 3, 4)
+        _test_reverse(test_case, input, [1], flow.int32)
+
+    def test_reverse_case_4(test_case):
+        input = np.arange(1 * 2 * 3 * 4).reshape(1, 2, 3, 4)
+        _test_reverse(test_case, input, [-3], flow.int32)
+
+    def test_reverse_case_5(test_case):
+        input = np.arange(1 * 2 * 3 * 4).reshape(1, 2, 3, 4)
+        _test_reverse(test_case, input, [2], flow.float32)
+
+    def test_reverse_case_6(test_case):
+        input = np.arange(1 * 2 * 3 * 4).reshape(1, 2, 3, 4)
+        _test_reverse(test_case, input, [-2], flow.float32)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_rsqrt.py b/python/oneflow/compatible/single_client/test/ops/test_rsqrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..0631b45e97cf7275ad323a375563d571c1c1c040
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_rsqrt.py
@@ -0,0 +1,58 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _check(test_case, x, y):
+    np_rsqrt = 1.0 / np.sqrt(x)
+    test_case.assertTrue(np.allclose(np_rsqrt, y))
+
+
+def _run_test(test_case, x, dtype, device):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def RsqrtJob(x: oft.Numpy.Placeholder(x.shape, dtype=dtype)):
+        return flow.math.rsqrt(x)
+
+    y = RsqrtJob(x).get()
+    _check(test_case, x, y.numpy())
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestRsqrt(flow.unittest.TestCase):
+    def test_rsqrt_random_gpu(test_case):
+        flow.config.gpu_device_num(2)
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, flow.float, "gpu")
+
+    def test_rsqrt_random_cpu(test_case):
+        flow.config.gpu_device_num(2)
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        _run_test(test_case, x, flow.float, "cpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_scalar_by_tensor_int.py b/python/oneflow/compatible/single_client/test/ops/test_scalar_by_tensor_int.py
new file mode 100644
index 0000000000000000000000000000000000000000..15fbd9a312dacaeca90bee9e5da330e8c89e0a9b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_scalar_by_tensor_int.py
@@ -0,0 +1,268 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _check(test_case, x, y, out, case):
+    if case == "add":
+        np_out = np.add(x, y)
+    elif case == "sub":
+        np_out = np.subtract(x, y)
+    elif case == "mul":
+        np_out = np.multiply(x, y)
+    elif case == "div":
+        if type(y[0]) == np.float32 or type(y[0]) == np.double:
+            np_out = np.divide(x, y)
+        else:
+            np_out = np.floor_divide(x, y)
+    test_case.assertTrue(np.allclose(np_out, out, rtol=1e-05, atol=1e-05))
+
+
+def _run_test(test_case, x, y, case, dtype=None, device="gpu"):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(function_config=func_config)
+    def ScalarByTensorJob(
+        x: oft.Numpy.Placeholder(x.shape, dtype=dtype),
+        y: oft.Numpy.Placeholder(y.shape, dtype=dtype),
+    ):
+        if case == "add":
+            return flow.math.add(x, y)
+        elif case == "sub":
+            return flow.math.subtract(x, y)
+        elif case == "mul":
+            return flow.math.multiply(x, y)
+        elif case == "div":
+            return flow.math.divide(x, y)
+
+    out = ScalarByTensorJob(x, y).get()
+    _check(test_case, x, y, out.numpy(), case)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestScalarByTensorInt(flow.unittest.TestCase):
+    def test_scalar_add_by_tensor_gpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "add", flow.float, "gpu")
+
+    def test_scalar_add_by_tensor_cpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "add", flow.float, "cpu")
+
+    def test_scalar_add_by_tensor_gpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "add", flow.double, "gpu")
+
+    def test_scalar_add_by_tensor_cpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "add", flow.double, "cpu")
+
+    def test_scalar_add_by_tensor_gpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "add", flow.int8, "gpu")
+
+    def test_scalar_add_by_tensor_cpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "add", flow.int8, "cpu")
+
+    def test_scalar_add_by_tensor_gpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "add", flow.int32, "gpu")
+
+    def test_scalar_add_by_tensor_cpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "add", flow.int32, "cpu")
+
+    def test_scalar_add_by_tensor_gpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "add", flow.int64, "gpu")
+
+    def test_scalar_add_by_tensor_cpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "add", flow.int64, "cpu")
+
+    def test_scalar_sub_by_tensor_gpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "sub", flow.float, "gpu")
+
+    def test_scalar_sub_by_tensor_cpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "sub", flow.float, "cpu")
+
+    def test_scalar_sub_by_tensor_gpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "sub", flow.double, "gpu")
+
+    def test_scalar_sub_by_tensor_cpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "sub", flow.double, "cpu")
+
+    def test_scalar_sub_by_tensor_gpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "sub", flow.int8, "gpu")
+
+    def test_scalar_sub_by_tensor_cpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "sub", flow.int8, "cpu")
+
+    def test_scalar_sub_by_tensor_gpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "sub", flow.int32, "gpu")
+
+    def test_scalar_sub_by_tensor_cpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "sub", flow.int32, "cpu")
+
+    def test_scalar_sub_by_tensor_gpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "sub", flow.int64, "gpu")
+
+    def test_scalar_sub_by_tensor_cpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "sub", flow.int64, "cpu")
+
+    def test_scalar_mul_by_tensor_gpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "mul", flow.float, "gpu")
+
+    def test_scalar_mul_by_tensor_cpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "mul", flow.float, "cpu")
+
+    def test_scalar_mul_by_tensor_gpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "mul", flow.double, "gpu")
+
+    def test_scalar_mul_by_tensor_cpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "mul", flow.double, "cpu")
+
+    def test_scalar_mul_by_tensor_gpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "mul", flow.int8, "gpu")
+
+    def test_scalar_mul_by_tensor_cpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "mul", flow.int8, "cpu")
+
+    def test_scalar_mul_by_tensor_gpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "mul", flow.int32, "gpu")
+
+    def test_scalar_mul_by_tensor_cpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "mul", flow.int32, "cpu")
+
+    def test_scalar_mul_by_tensor_gpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "mul", flow.int64, "gpu")
+
+    def test_scalar_mul_by_tensor_cpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "mul", flow.int64, "cpu")
+
+    def test_scalar_div_by_tensor_gpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "div", flow.float, "gpu")
+
+    def test_scalar_div_by_tensor_cpu_float(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+        _run_test(test_case, x, y, "div", flow.float, "cpu")
+
+    def test_scalar_div_by_tensor_gpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "div", flow.double, "gpu")
+
+    def test_scalar_div_by_tensor_cpu_double(test_case):
+        x = np.random.rand(10, 3, 32, 1024).astype(np.double)
+        y = np.random.rand(1).astype(np.double)
+        _run_test(test_case, x, y, "div", flow.double, "cpu")
+
+    def test_scalar_div_by_tensor_gpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "div", flow.int8, "gpu")
+
+    def test_scalar_div_by_tensor_cpu_int8(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int8)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int8)
+        _run_test(test_case, x, y, "div", flow.int8, "cpu")
+
+    def test_scalar_div_by_tensor_gpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "div", flow.int32, "gpu")
+
+    def test_scalar_div_by_tensor_cpu_int32(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int32)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int32)
+        _run_test(test_case, x, y, "div", flow.int32, "cpu")
+
+    def test_scalar_div_by_tensor_gpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "div", flow.int64, "gpu")
+
+    def test_scalar_div_by_tensor_cpu_int64(test_case):
+        x = np.random.randint(low=1, high=10, size=(10, 3, 32, 1024), dtype=np.int64)
+        y = np.random.randint(low=1, high=10, size=(1,), dtype=np.int64)
+        _run_test(test_case, x, y, "div", flow.int64, "cpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_scalar_by_tensor_ops.py b/python/oneflow/compatible/single_client/test/ops/test_scalar_by_tensor_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c93336683592147bf38e15e272262d1df6f8cab1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_scalar_by_tensor_ops.py
@@ -0,0 +1,140 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, data_type, x_shape, case):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def ScalarAddByTensorJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            y = flow.get_variable(
+                "y",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
+                trainable=True,
+            )
+            if case == "add":
+                loss = flow.math.add(x, y)
+            elif case == "sub":
+                loss = flow.math.subtract(x, y)
+            elif case == "mul":
+                loss = flow.math.multiply(x, y)
+            elif case == "div":
+                loss = flow.math.divide(x, y)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch(y, test_global_storage.Setter("y"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = ScalarAddByTensorJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        y = tf.Variable(test_global_storage.Get("y"))
+        if case == "add":
+            tf_out = x + y
+        elif case == "sub":
+            tf_out = x - y
+        elif case == "mul":
+            tf_out = x * y
+        elif case == "div":
+            tf_out = x / y
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tf_y_diff = tape.gradient(tf_out, y, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+    assert np.allclose(
+        test_global_storage.Get("y_diff"), tf_y_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestScalarByTensorOps(flow.unittest.TestCase):
+    def test_add(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = [flow.float]
+        arg_dict["x_shape"] = [(10, 20, 30)]
+        arg_dict["case"] = ["add"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_sub(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = [flow.float]
+        arg_dict["x_shape"] = [(10, 20, 30)]
+        arg_dict["case"] = ["sub"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_mul(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = [flow.float]
+        arg_dict["x_shape"] = [(10, 20, 30)]
+        arg_dict["case"] = ["mul"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_div(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_type"] = [flow.float]
+        arg_dict["x_shape"] = [(10, 20, 30)]
+        arg_dict["case"] = ["div"]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_scalar_divide.py b/python/oneflow/compatible/single_client/test/ops/test_scalar_divide.py
new file mode 100644
index 0000000000000000000000000000000000000000..4463101b41b392db772fdfe99adb048c5b195fd5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_scalar_divide.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestScalarDivide(flow.unittest.TestCase):
+    def test_scalar_div_2(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        func_config.default_data_type(flow.float)
+
+        @flow.global_function(function_config=func_config)
+        def Div2Job(a: oft.Numpy.Placeholder((10, 10))):
+            return a / 2
+
+        x = np.random.rand(10, 10).astype(np.float32) + 1
+        y = Div2Job(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, x / 2))
+
+    def test_scalar_div_by_2(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.consistent_view())
+        func_config.default_data_type(flow.float)
+
+        @flow.global_function(function_config=func_config)
+        def DivBy2Job(a: oft.Numpy.Placeholder((10, 10))):
+            return 2 / a
+
+        x = np.random.rand(10, 10).astype(np.float32) + 1
+        y = DivBy2Job(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, 2 / x))
+
+    def test_scalar_div_2_mirrored(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        func_config.default_data_type(flow.float)
+
+        @flow.global_function(function_config=func_config)
+        def Div2Job(a: oft.ListNumpy.Placeholder((10, 10))):
+            return a / 2
+
+        x = np.random.rand(10, 10).astype(np.float32) + 1
+        y = Div2Job([x]).get().numpy_list()[0]
+        test_case.assertTrue(np.allclose(y, x / 2))
+
+    def test_scalar_div_by_2_mirrored(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+        func_config.default_data_type(flow.float)
+
+        @flow.global_function(function_config=func_config)
+        def DivBy2Job(a: oft.ListNumpy.Placeholder((10, 10))):
+            return 2 / a
+
+        x = np.random.rand(10, 10).astype(np.float32) + 1
+        y = DivBy2Job([x]).get().numpy_list()[0]
+        test_case.assertTrue(np.allclose(y, 2 / x))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_scalar_pow.py b/python/oneflow/compatible/single_client/test/ops/test_scalar_pow.py
new file mode 100644
index 0000000000000000000000000000000000000000..af0407763806e8e8bc61005cead8418c074a7135
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_scalar_pow.py
@@ -0,0 +1,140 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_scalar_pow_with_np(
+    input_shape, exponent, device_type, value_type, machine_ids, device_counts
+):
+    input_1 = np.random.uniform(0, 1, size=input_shape).astype(value_type[0])
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_data_type(value_type[1])
+
+    def np_pow(input, exponent):
+        out = np.power(input, exponent)
+        return np.array(out).astype(value_type[0])
+
+    np_out_pow = np_pow(input_1, exponent)
+
+    def np_diff(input, exponent):
+        diff = exponent * np.power(input, exponent - 1)
+        return diff
+
+    _np_grad = np_diff(input_1, exponent)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad, atol=1e-05)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_pow(
+        of_input_1: tp.Numpy.Placeholder(shape=input_1.shape, dtype=value_type[1])
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input_1.shape,
+                dtype=value_type[1],
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input_1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        of_pow_out = flow.math.pow(x_var, exponent)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_pow_out)
+        return of_pow_out
+
+    of_out_pow = oneflow_pow(input_1)
+    assert np.allclose(of_out_pow, np_out_pow, atol=1e-05)
+
+
+def _gen_arg_dict(shape, exponent, device_type, value_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["exponent"] = [exponent]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["value_type"] = [(np.float32, flow.float32), (np.float64, flow.float64)]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestScalarPow1n1d(flow.unittest.TestCase):
+    def test_scalar_pow_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3),
+            exponent=1.4,
+            device_type="cpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_scalar_pow_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_scalar_pow_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 4),
+            exponent=2.3,
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_scalar_pow_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestScalarPow1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_pow_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 8, 4),
+            exponent=2.0,
+            device_type="gpu",
+            value_type="float",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_scalar_pow_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_scatter_nd.py b/python/oneflow/compatible/single_client/test/ops/test_scatter_nd.py
new file mode 100644
index 0000000000000000000000000000000000000000..17c884d2d6e81e7dcfc3270474d8c367e3063686
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_scatter_nd.py
@@ -0,0 +1,655 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _random_inputs(
+    params_shape, indices_shape, updates_shape, allow_duplicate_index=True
+):
+    params = np.random.rand(*params_shape).astype(np.float32)
+    updates = np.random.rand(*updates_shape).astype(np.float32)
+    indices = []
+    indices_rows = np.prod(indices_shape[:-1])
+    indices_cols = indices_shape[-1]
+    for col in range(indices_cols):
+        if allow_duplicate_index is False and indices_rows <= params_shape[col]:
+            rand_indices = np.arange(params_shape[col], dtype=np.int32)
+            np.random.shuffle(rand_indices)
+            indices_col = rand_indices[:indices_rows].reshape(indices_shape[:-1])
+        else:
+            indices_col = np.random.randint(
+                low=0, high=params_shape[col], size=(indices_rows,), dtype=np.int32
+            ).reshape(indices_shape[:-1])
+        indices.append(indices_col)
+    indices = np.stack(indices, axis=len(indices_shape) - 1)
+    if allow_duplicate_index is False:
+        existing_nd_index_set = set()
+        for nd_index in indices.reshape(-1, indices.shape[-1]):
+            nd_index_str = "(" + ",".join(map(str, nd_index)) + ")"
+            assert (
+                nd_index_str not in existing_nd_index_set
+            ), "random generated duplicate nd index {}".format(nd_index_str)
+            existing_nd_index_set.add(nd_index_str)
+    return (params, updates, indices)
+
+
+def _make_scatter_nd_fn(indices, updates, shape, device_type, mirrored, compare_fn):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if mirrored:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+    def do_scatter_nd(indices_blob, updates_blob):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "updates",
+                shape=updates.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            x = flow.cast_to_current_logical_view(x)
+            x = x + updates_blob
+            y = flow.scatter_nd(indices_blob, x, shape)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(y)
+        flow.watch_diff(x, compare_fn)
+        return y
+
+    if mirrored:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def scatter_nd_fn(
+            indices_def: oft.ListNumpy.Placeholder(indices.shape, dtype=flow.int32),
+            updates_def: oft.ListNumpy.Placeholder(updates.shape, dtype=flow.float),
+        ):
+            return do_scatter_nd(indices_def, updates_def)
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def scatter_nd_fn(
+            indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
+            updates_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
+        ):
+            return do_scatter_nd(indices_def, updates_def)
+
+    return scatter_nd_fn
+
+
+def _compare_scatter_nd_with_tf(
+    test_case,
+    device_type,
+    params_shape,
+    indices_shape,
+    updates_shape,
+    mirrored=False,
+    verbose=False,
+):
+    (_, updates, indices) = _random_inputs(params_shape, indices_shape, updates_shape)
+    indices_const = tf.constant(indices)
+    with tf.GradientTape() as t:
+        x = tf.Variable(updates)
+        y = tf.scatter_nd(indices_const, x, params_shape)
+    dy_dx = t.gradient(y, x)
+    if mirrored:
+
+        def compare_dy(params_grad):
+            test_case.assertTrue(
+                np.array_equal(dy_dx.numpy(), params_grad.numpy_list()[0])
+            )
+
+    else:
+
+        def compare_dy(params_grad):
+            test_case.assertTrue(np.array_equal(dy_dx.numpy(), params_grad.numpy()))
+
+    scatter_nd_fn = _make_scatter_nd_fn(
+        indices, updates, params_shape, device_type, mirrored, compare_dy
+    )
+    if mirrored:
+        of_y = scatter_nd_fn([indices], [updates]).get().numpy_list()[0]
+    else:
+        of_y = scatter_nd_fn(indices, updates).get().numpy()
+    if verbose is True:
+        print("device_type:", device_type)
+        print("indices:", indices)
+        print("updates:", updates)
+        print("tf_params:", y.numpy())
+        print("of_params:", of_y)
+    test_case.assertTrue(np.allclose(y.numpy(), of_y))
+
+
+def _compare_scatter_nd_update_with_tf(
+    test_case,
+    device_type,
+    params_shape,
+    indices_shape,
+    updates_shape,
+    allow_duplicate_index=False,
+    verbose=False,
+):
+    (params, updates, indices) = _random_inputs(
+        params_shape, indices_shape, updates_shape, allow_duplicate_index
+    )
+    x_const = tf.constant(params)
+    y_const = tf.constant(updates)
+    i_const = tf.constant(indices)
+    with tf.GradientTape() as t1:
+        x = tf.Variable(params)
+        z1 = tf.tensor_scatter_nd_update(x, i_const, y_const)
+    dz_dx = t1.gradient(z1, x)
+    with tf.GradientTape() as t2:
+        y = tf.Variable(updates)
+        z2 = tf.tensor_scatter_nd_update(x_const, i_const, y)
+    dz_dy = t2.gradient(z2, y)
+    test_case.assertTrue(np.allclose(z1.numpy(), z2.numpy()))
+
+    def compare_dz_dx(params_grad):
+        test_case.assertTrue(np.allclose(dz_dx.numpy(), params_grad.numpy()))
+
+    def compare_dz_dy(updates_grad):
+        test_case.assertTrue(np.allclose(dz_dy.numpy(), updates_grad.numpy()))
+
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(type="train", function_config=func_config)
+    def scatter_nd_update_grad_fn(
+        x_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
+        indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
+        y_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "params",
+                shape=params.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            y = flow.get_variable(
+                "updates",
+                shape=updates.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            x = x + x_def
+            y = y + y_def
+            z = flow.tensor_scatter_nd_update(x, indices_def, y)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(z)
+        flow.watch_diff(x, compare_dz_dx)
+        flow.watch_diff(y, compare_dz_dy)
+        return z
+
+    of_z = scatter_nd_update_grad_fn(params, indices, updates).get()
+    if verbose is True:
+        print("device_type:", device_type)
+        print("x:", params)
+        print("y:", updates)
+        print("indices:", indices)
+        print("tf_z:", z1.numpy())
+        print("of_z:", of_z.numpy())
+    test_case.assertTrue(np.allclose(z1.numpy(), of_z.numpy()))
+
+
+def _of_tensor_scatter_nd_add(
+    params,
+    indices,
+    updates,
+    device_type,
+    mirrored,
+    params_grad_watcher,
+    updates_grad_watcher,
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    def do_tensor_scatter_nd_add(params_blob, indices_blob, updates_blob):
+        with flow.scope.placement(device_type, "0:0"):
+            params_var = flow.get_variable(
+                "params",
+                shape=params_blob.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            updates_var = flow.get_variable(
+                "updates",
+                shape=updates_blob.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            params_var = flow.cast_to_current_logical_view(params_var)
+            params_blob = flow.cast_to_current_logical_view(params_blob)
+            updates_blob = flow.cast_to_current_logical_view(updates_blob)
+            updates_var = flow.cast_to_current_logical_view(updates_var)
+            params_var = params_var + params_blob
+            updates_var = updates_var + updates_blob
+            out = flow.tensor_scatter_nd_add(params_var, indices_blob, updates_var)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(out)
+        flow.watch_diff(params_var, params_grad_watcher)
+        flow.watch_diff(updates_var, updates_grad_watcher)
+        return out
+
+    if mirrored:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(type="train", function_config=func_config)
+        def tensor_scatter_nd_add_fn(
+            params_def: oft.ListNumpy.Placeholder(params.shape, dtype=flow.float),
+            indices_def: oft.ListNumpy.Placeholder(indices.shape, dtype=flow.int32),
+            updates_def: oft.ListNumpy.Placeholder(updates.shape, dtype=flow.float),
+        ):
+            return do_tensor_scatter_nd_add(params_def, indices_def, updates_def)
+
+        return (
+            tensor_scatter_nd_add_fn([params], [indices], [updates])
+            .get()
+            .numpy_list()[0]
+        )
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(type="train", function_config=func_config)
+        def tensor_scatter_nd_add_fn(
+            params_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
+            indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
+            updates_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
+        ):
+            return do_tensor_scatter_nd_add(params_def, indices_def, updates_def)
+
+        return tensor_scatter_nd_add_fn(params, indices, updates).get().numpy()
+
+
+def _compare_tensor_scatter_nd_add_with_tf(
+    test_case, params_shape, indices_shape, updates_shape, device_type, mirrored
+):
+    (params, updates, indices) = _random_inputs(
+        params_shape, indices_shape, updates_shape, True
+    )
+    params_const = tf.constant(params)
+    indices_const = tf.constant(indices)
+    updates_const = tf.constant(updates)
+    with tf.GradientTape() as t1:
+        params_var = tf.Variable(params)
+        tf_out1 = tf.tensor_scatter_nd_add(params_var, indices_const, updates_const)
+    tf_params_grad = t1.gradient(tf_out1, params_var)
+    with tf.GradientTape() as t2:
+        updates_var = tf.Variable(updates)
+        tf_out2 = tf.tensor_scatter_nd_add(params_const, indices_const, updates_var)
+    tf_updates_grad = t2.gradient(tf_out2, updates_var)
+    test_case.assertTrue(np.allclose(tf_out1.numpy(), tf_out2.numpy()))
+
+    def compare_params_grad(of_params_grad):
+        tf_params_grad_np = tf_params_grad.numpy()
+        of_params_grad_np = (
+            of_params_grad.numpy_list()[0] if mirrored else of_params_grad.numpy()
+        )
+        test_case.assertTrue(np.allclose(tf_params_grad_np, of_params_grad_np))
+
+    def compare_updates_grad(of_updates_grad):
+        tf_updates_grad_np = tf_updates_grad.numpy()
+        of_updates_grad_np = (
+            of_updates_grad.numpy_list()[0] if mirrored else of_updates_grad.numpy()
+        )
+        test_case.assertTrue(np.allclose(tf_updates_grad_np, of_updates_grad_np))
+
+    of_out = _of_tensor_scatter_nd_add(
+        params,
+        indices,
+        updates,
+        device_type,
+        mirrored,
+        compare_params_grad,
+        compare_updates_grad,
+    )
+    test_case.assertTrue(np.allclose(tf_out1.numpy(), of_out))
+
+
+def _of_scatter_nd_dynamic_indices(
+    indices, updates, indices_static_shape, updates_static_shape, params_shape
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def scatter_nd_fn(
+        indices_def: oft.ListNumpy.Placeholder(indices_static_shape, dtype=flow.int32),
+        updates_def: oft.ListNumpy.Placeholder(updates_static_shape, dtype=flow.float),
+    ):
+        with flow.scope.placement("gpu", "0:0"):
+            return flow.scatter_nd(indices_def, updates_def, params_shape)
+
+    return scatter_nd_fn([indices], [updates]).get().numpy_list()[0]
+
+
+def _compare_scatter_nd_dynamic_indices_with_tf(
+    test_case,
+    indices_shape,
+    updates_shape,
+    indices_static_shape,
+    updates_static_shape,
+    params_shape,
+):
+    (_, updates, indices) = _random_inputs(params_shape, indices_shape, updates_shape)
+    indices_const = tf.constant(indices)
+    x = tf.Variable(updates)
+    y = tf.scatter_nd(indices_const, x, params_shape)
+    of_y = _of_scatter_nd_dynamic_indices(
+        indices, updates, indices_static_shape, updates_static_shape, params_shape
+    )
+    test_case.assertTrue(np.allclose(y.numpy(), of_y))
+
+
+def _of_tensor_scatter_nd_update_dynamic_indices(
+    params, indices, updates, indices_static_shape, updates_static_shape
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def tensor_scatter_nd_update_fn(
+        params_def: oft.ListNumpy.Placeholder(params.shape, dtype=flow.float),
+        indices_def: oft.ListNumpy.Placeholder(indices_static_shape, dtype=flow.int32),
+        updates_def: oft.ListNumpy.Placeholder(updates_static_shape, dtype=flow.float),
+    ):
+        with flow.scope.placement("gpu", "0:0"):
+            return flow.tensor_scatter_nd_update(params_def, indices_def, updates_def)
+
+    return (
+        tensor_scatter_nd_update_fn([params], [indices], [updates])
+        .get()
+        .numpy_list()[0]
+    )
+
+
+def _compare_tensor_scatter_nd_update_dynamic_indices_with_tf(
+    test_case,
+    params_shape,
+    indices_shape,
+    updates_shape,
+    indices_static_shape,
+    updates_static_shape,
+):
+    (params, updates, indices) = _random_inputs(
+        params_shape, indices_shape, updates_shape, False
+    )
+    i = tf.constant(indices)
+    x = tf.Variable(params)
+    y = tf.Variable(updates)
+    z = tf.tensor_scatter_nd_update(x, i, y)
+    of_z = _of_tensor_scatter_nd_update_dynamic_indices(
+        params, indices, updates, indices_static_shape, updates_static_shape
+    )
+    test_case.assertTrue(np.allclose(z.numpy(), of_z))
+
+
+def _of_tensor_scatter_nd_add_dynamic_indices(
+    params, indices, updates, indices_static_shape, updates_static_shape
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(function_config=func_config)
+    def tensor_scatter_nd_add_fn(
+        params_def: oft.ListNumpy.Placeholder(params.shape, dtype=flow.float),
+        indices_def: oft.ListNumpy.Placeholder(indices_static_shape, dtype=flow.int32),
+        updates_def: oft.ListNumpy.Placeholder(updates_static_shape, dtype=flow.float),
+    ):
+        with flow.scope.placement("gpu", "0:0"):
+            return flow.tensor_scatter_nd_add(params_def, indices_def, updates_def)
+
+    return (
+        tensor_scatter_nd_add_fn([params], [indices], [updates]).get().numpy_list()[0]
+    )
+
+
+def _compare_tensor_scatter_nd_add_dynamic_indices_with_tf(
+    test_case,
+    params_shape,
+    indices_shape,
+    updates_shape,
+    indices_static_shape,
+    updates_static_shape,
+):
+    (params, updates, indices) = _random_inputs(
+        params_shape, indices_shape, updates_shape
+    )
+    i = tf.constant(indices)
+    x = tf.Variable(params)
+    y = tf.Variable(updates)
+    z = tf.tensor_scatter_nd_add(x, i, y)
+    of_z = _of_tensor_scatter_nd_add_dynamic_indices(
+        params, indices, updates, indices_static_shape, updates_static_shape
+    )
+    test_case.assertTrue(np.allclose(z.numpy(), of_z))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestScatterNd(flow.unittest.TestCase):
+    def test_scatter_nd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["params_shape"] = [(10,)]
+        arg_dict["indices_shape"] = [(5, 1)]
+        arg_dict["updates_shape"] = [(5,)]
+        arg_dict["mirrored"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_with_tf(test_case, *arg)
+
+    def test_scatter_nd_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(128,)]
+        arg_dict["indices_shape"] = [(100, 1)]
+        arg_dict["updates_shape"] = [(100,)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_with_tf(test_case, *arg)
+
+    def test_scatter_nd_case_2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(32, 16, 4)]
+        arg_dict["indices_shape"] = [(50, 2)]
+        arg_dict["updates_shape"] = [(50, 4)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_with_tf(test_case, *arg)
+
+    def test_scatter_nd_case_3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(24, 25, 32, 10, 12)]
+        arg_dict["indices_shape"] = [(3, 4, 2)]
+        arg_dict["updates_shape"] = [(3, 4, 32, 10, 12)]
+        arg_dict["mirrored"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_with_tf(test_case, *arg)
+
+    def test_scatter_nd_case_4(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(8,)]
+        arg_dict["indices_shape"] = [(12, 1)]
+        arg_dict["updates_shape"] = [(12,)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_with_tf(test_case, *arg)
+
+    def test_scatter_nd_update(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["params_shape"] = [(10,)]
+        arg_dict["indices_shape"] = [(5, 1)]
+        arg_dict["updates_shape"] = [(5,)]
+        arg_dict["allow_duplicate_index"] = [False]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_update_with_tf(test_case, *arg)
+
+    def test_scatter_nd_update_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(256, 64)]
+        arg_dict["indices_shape"] = [(128, 2)]
+        arg_dict["updates_shape"] = [(128,)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_update_with_tf(test_case, *arg)
+
+    def test_scatter_nd_update_case_2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["params_shape"] = [(20, 10, 11, 3, 5)]
+        arg_dict["indices_shape"] = [(2, 4, 3)]
+        arg_dict["updates_shape"] = [(2, 4, 3, 5)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_update_with_tf(test_case, *arg)
+
+    def test_scatter_nd_update_case_3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["params_shape"] = [(256, 4)]
+        arg_dict["indices_shape"] = [(10, 25, 1)]
+        arg_dict["updates_shape"] = [(10, 25, 4)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_update_with_tf(test_case, *arg)
+
+    def test_tensor_scatter_nd_add(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["params_shape"] = [(12,)]
+        arg_dict["indices_shape"] = [(7, 1)]
+        arg_dict["updates_shape"] = [(7,)]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["mirrored"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_tensor_scatter_nd_add_with_tf(test_case, *arg)
+
+    def test_tensor_scatter_nd_add_case1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["params_shape"] = [(38, 66, 9)]
+        arg_dict["indices_shape"] = [(17, 2)]
+        arg_dict["updates_shape"] = [(17, 9)]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["mirrored"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_tensor_scatter_nd_add_with_tf(test_case, *arg)
+
+    def test_tensor_scatter_nd_add_case2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["params_shape"] = [(2, 7, 19, 41, 33)]
+        arg_dict["indices_shape"] = [(20, 9, 3)]
+        arg_dict["updates_shape"] = [(20, 9, 41, 33)]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["mirrored"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            _compare_tensor_scatter_nd_add_with_tf(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_scatter_nd_dynamic_indices(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["indices_shape"] = [(12, 10, 2)]
+        arg_dict["updates_shape"] = [(12, 10, 41, 33)]
+        arg_dict["indices_static_shape"] = [(15, 10, 2)]
+        arg_dict["updates_static_shape"] = [(15, 10, 41, 33)]
+        arg_dict["params_shape"] = [(64, 22, 41, 33)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_dynamic_indices_with_tf(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_scatter_nd_empty_indices(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["indices_shape"] = [(0, 1)]
+        arg_dict["updates_shape"] = [(0, 14)]
+        arg_dict["indices_static_shape"] = [(8, 1)]
+        arg_dict["updates_static_shape"] = [(8, 14)]
+        arg_dict["params_shape"] = [(10, 14)]
+        for arg in GenArgList(arg_dict):
+            _compare_scatter_nd_dynamic_indices_with_tf(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_tensor_scatter_nd_update_dynamic_indices(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["params_shape"] = [(32, 33, 4, 5)]
+        arg_dict["indices_shape"] = [(12, 2)]
+        arg_dict["updates_shape"] = [(12, 4, 5)]
+        arg_dict["indices_static_shape"] = [(14, 2)]
+        arg_dict["updates_static_shape"] = [(14, 4, 5)]
+        for arg in GenArgList(arg_dict):
+            _compare_tensor_scatter_nd_update_dynamic_indices_with_tf(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_tensor_scatter_nd_update_empty_indices(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["params_shape"] = [(37, 14)]
+        arg_dict["indices_shape"] = [(7, 0, 1)]
+        arg_dict["updates_shape"] = [(7, 0, 14)]
+        arg_dict["indices_static_shape"] = [(7, 5, 1)]
+        arg_dict["updates_static_shape"] = [(7, 5, 14)]
+        for arg in GenArgList(arg_dict):
+            _compare_tensor_scatter_nd_update_dynamic_indices_with_tf(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_tensor_scatter_nd_add_dynamic_indices(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["params_shape"] = [(2, 9, 7, 5, 4)]
+        arg_dict["indices_shape"] = [(12, 5, 3)]
+        arg_dict["updates_shape"] = [(12, 5, 5, 4)]
+        arg_dict["indices_static_shape"] = [(15, 6, 3)]
+        arg_dict["updates_static_shape"] = [(15, 6, 5, 4)]
+        for arg in GenArgList(arg_dict):
+            _compare_tensor_scatter_nd_add_dynamic_indices_with_tf(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_tensor_scatter_nd_add_empty_indices(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["params_shape"] = [(24, 30, 14)]
+        arg_dict["indices_shape"] = [(0, 2)]
+        arg_dict["updates_shape"] = [(0, 14)]
+        arg_dict["indices_static_shape"] = [(11, 2)]
+        arg_dict["updates_static_shape"] = [(11, 14)]
+        for arg in GenArgList(arg_dict):
+            _compare_tensor_scatter_nd_add_dynamic_indices_with_tf(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_shape.py b/python/oneflow/compatible/single_client/test/ops/test_shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8841278249038826968b60cbc6801b66abd73a4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_shape.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestShape(flow.unittest.TestCase):
+    def test_shape(test_case):
+        flow.clear_default_session()
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def foo_job(input: oft.Numpy.Placeholder(shape=(2, 5))):
+            ret = flow.identity(input)
+            test_case.assertTrue(ret.shape == (1, 5))
+
+        input_tensor = np.arange(10).reshape(2, 5).astype(np.single)
+        foo_job(input_tensor)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_shuffle.py b/python/oneflow/compatible/single_client/test/ops/test_shuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..89e495f5b67501c38d30a7896c46dbabe66d6fd4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_shuffle.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+import uuid
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestShuffle(flow.unittest.TestCase):
+    def test_shuffle(_):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(100,), (10, 1000), (10, 10, 2000)]
+        arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+        for (device_type, x_shape, data_type) in GenArgList(arg_dict):
+            assert device_type in ["gpu", "cpu"]
+            assert data_type in ["float32", "double", "int8", "int32", "int64"]
+            flow.clear_default_session()
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+
+            @flow.global_function(function_config=flow.FunctionConfig())
+            def TestJob(
+                x: oft.Numpy.Placeholder(
+                    x_shape, dtype=type_name_to_flow_type[data_type]
+                )
+            ):
+                with flow.scope.placement(device_type, "0:0"):
+                    return flow.random.shuffle(x)
+
+            x = np.random.randn(*x_shape).astype(type_name_to_np_type[data_type])
+            ret = TestJob(x).get().numpy()
+            assert np.array_equal(x, ret) == False, x_shape
+            x.sort(0)
+            ret.sort(0)
+            assert np.array_equal(x, ret), x_shape
+            assert device_type in ["gpu", "cpu"]
+            assert data_type in ["float32", "double", "int8", "int32", "int64"]
+            flow.clear_default_session()
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+
+            @flow.global_function(function_config=flow.FunctionConfig())
+            def TestJob1(
+                x: oft.Numpy.Placeholder(
+                    x_shape, dtype=type_name_to_flow_type[data_type]
+                )
+            ):
+                with flow.scope.placement(device_type, "0:0"):
+                    return flow.random.generate_random_batch_permutation_indices(x)
+
+            x = np.random.randn(*x_shape).astype(type_name_to_np_type[data_type])
+            ret = TestJob1(x).get().numpy()
+            idx = np.arange(x_shape[0]).astype(np.int32)
+            assert np.array_equal(idx, ret) == False, x_shape
+            idx.sort()
+            ret.sort()
+            assert np.array_equal(idx, ret), x_shape
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sigmoid_cross_entropy.py b/python/oneflow/compatible/single_client/test/ops/test_sigmoid_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f82c4cbba9cdfcac11b6b93a6d949495ea2ba31
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sigmoid_cross_entropy.py
@@ -0,0 +1,96 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, data_type, shape):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    dtype = type_name_to_flow_type[data_type]
+
+    def np_sigmoid(x):
+        return 1 / (1 + np.exp(-x))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SigmoidCrossEntropyWithLogitsJob(labels: oft.Numpy.Placeholder(shape, dtype)):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=shape,
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = flow.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=x)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    labels = np_sigmoid(np.random.randint(0, 10, size=shape)).astype(
+        type_name_to_np_type[data_type]
+    )
+    of_out = SigmoidCrossEntropyWithLogitsJob(labels).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.sigmoid_cross_entropy_with_logits(labels, x)
+        loss_diff = test_global_storage.Get("loss_diff")
+        tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    tolerance = 1e-05
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=tolerance, atol=tolerance)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"),
+        tf_x_diff.numpy(),
+        rtol=tolerance,
+        atol=tolerance,
+    )
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSigmoidCrossEntropy(flow.unittest.TestCase):
+    def test_sigmoid_cross_entropy_with_logits(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["data_type"] = ["double", "float32"]
+        arg_dict["shape"] = [(64, 1000), (5, 5, 1000)]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_size.py b/python/oneflow/compatible/single_client/test/ops/test_size.py
new file mode 100644
index 0000000000000000000000000000000000000000..07e4e4ecab3fdae0056a2acf2a1af4c0056872ad
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_size.py
@@ -0,0 +1,125 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import random
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    GenArgDict,
+    GenArgList,
+    type_name_to_flow_type,
+    type_name_to_np_type,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _compare_with_np(test_case, x_shape, dtype):
+    x = np.random.randn(*x_shape).astype(type_name_to_np_type[dtype])
+    ret = flow.Size(x_shape)
+    for idx in range(0, len(ret)):
+        test_case.assertEqual(ret[idx], x.shape[idx])
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSize(flow.unittest.TestCase):
+    def test_size(test_case):
+        size = flow.Size((4, 3, 10, 5))
+        test_case.assertTrue(size[0] == 4)
+        test_case.assertTrue(size[2] == 10)
+        test_case.assertTrue(len(size) == 4)
+        size = flow.Size([4, 3, 10, 5])
+        test_case.assertTrue(size[0] == 4)
+        test_case.assertTrue(size[2] == 10)
+        test_case.assertTrue(len(size) == 4)
+        size = flow.Size(size)
+        test_case.assertTrue(size[0] == 4)
+        test_case.assertTrue(size[2] == 10)
+        test_case.assertTrue(len(size) == 4)
+        test_case.assertTrue(size[-1] == 5)
+        test_case.assertTrue(size[-4] == 4)
+
+    def test_unpack(test_case):
+        (one, two, three, four) = flow.Size((1, 2, 3, 4))
+        test_case.assertEqual(one, 1)
+        test_case.assertEqual(two, 2)
+        test_case.assertEqual(three, 3)
+        test_case.assertEqual(four, 4)
+
+    def test_offical(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["x_shape"] = [
+            (10,),
+            (20, 10),
+            (20, 10, 10),
+            (20, 10, 10, 3),
+            (20, 10, 10, 3, 3),
+        ]
+        arg_dict["dtype"] = ["float32", "int32", "double"]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    def test_equal(test_case):
+        size = flow.Size((2, 3))
+        test_case.assertEqual(size == (2, 3), True)
+        test_case.assertEqual(size == (3, 2), False)
+        test_case.assertEqual(size == flow.Size((2, 3)), True)
+        test_case.assertEqual(size == flow.Size((3, 2)), False)
+        test_case.assertEqual(size == [2, 3], False)
+        test_case.assertEqual(size == dict(), False)
+
+    def test_numel(test_case):
+        size = flow.Size((1, 2, 3, 4))
+        test_case.assertEqual(size.numel(), 24)
+
+    def test_count(test_case):
+        size = flow.Size((2, 2, 3, 4))
+        test_case.assertEqual(size.count(1), 0)
+        test_case.assertEqual(size.count(2), 2)
+        test_case.assertEqual(size.count(3), 1)
+        test_case.assertEqual(size.count(4), 1)
+
+    def test_index(test_case):
+        size = flow.Size((2, 3, 2, 4, 4))
+        test_case.assertEqual(size.index(2), 0)
+        test_case.assertEqual(size.index(2, start=0), 0)
+        test_case.assertEqual(size.index(2, start=0, end=20), 0)
+        test_case.assertEqual(size.index(2, start=1, end=20), 2)
+        test_case.assertEqual(size.index(4), 3)
+        test_case.assertEqual(size.index(4, start=4), 4)
+        with test_case.assertRaises(ValueError):
+            size.index(4, start=0, end=3)
+        with test_case.assertRaises(ValueError):
+            size.index(5)
+        with test_case.assertRaises(ValueError):
+            size.index(2, start=3)
+
+    def test_slicing(test_case):
+        size = flow.Size([2, 3, 4, 5])
+        test_case.assertTrue(size[1:3] == flow.Size((3, 4)))
+        test_case.assertTrue(size[1:] == flow.Size((3, 4, 5)))
+        test_case.assertTrue(size[:2] == (2, 3))
+        test_case.assertTrue(size[-3:] == flow.Size((3, 4, 5)))
+        test_case.assertTrue(size[-3:-1] == flow.Size((3, 4)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_slice_v2.py b/python/oneflow/compatible/single_client/test/ops/test_slice_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..75beb83ca68c6b1a66df0add3336868bdf16a191
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_slice_v2.py
@@ -0,0 +1,558 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import collections
+import os
+import typing as tp
+import unittest
+
+import numpy as np
+import test_util
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as otp
+
+DEFAULT_DEVICE_TAG = "gpu"
+if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+    DEFAULT_DEVICE_TAG = "cpu"
+
+
+def _do_slice(input, args, name=None):
+    outputs = []
+    for slice_tup_list in args:
+        output = flow.slice_v2(input, slice_tup_list, name)
+        outputs.append(output)
+    return outputs
+
+
+def _make_slice_func(slice_args, input_shape, dtype=flow.float32, func_cfg=None):
+    @flow.global_function(type="predict", function_config=func_cfg)
+    def slice_job(
+        x: otp.Numpy.Placeholder(shape=input_shape, dtype=dtype)
+    ) -> tp.List[otp.Numpy]:
+        return _do_slice(x, slice_args)
+
+    return slice_job
+
+
+def _make_slice_with_fp16_func(slice_args, input_shape, func_cfg=None):
+    @flow.global_function(type="predict", function_config=func_cfg)
+    def slice_job(
+        x: otp.Numpy.Placeholder(shape=input_shape, dtype=flow.float32)
+    ) -> tp.List[otp.Numpy]:
+        x = flow.cast(x, flow.float16)
+        y = _do_slice(x, slice_args)
+        return [flow.cast(y_i, flow.float32) for y_i in y]
+
+    return slice_job
+
+
+def _make_slice_dynamic_func(
+    slice_args, input_shape, dtype=flow.float32, func_cfg=None
+):
+    if func_cfg is None:
+        func_cfg = flow.FunctionConfig()
+        func_cfg.default_logical_view(flow.scope.mirrored_view())
+
+    @flow.global_function(type="predict", function_config=func_cfg)
+    def slice_dynamic_job(
+        x: otp.ListNumpy.Placeholder(shape=input_shape, dtype=dtype)
+    ) -> tp.List[otp.ListNumpy]:
+        return _do_slice(x, slice_args, name="SliceDynamic")
+
+    return slice_dynamic_job
+
+
+def _make_slice_with_grad_func(
+    slice_tup_list, input_shape, watch_diff_cb=None, dtype=flow.float32, func_cfg=None
+):
+    @flow.global_function(type="train", function_config=func_cfg)
+    def slice_with_grad_job(
+        x: otp.Numpy.Placeholder(shape=input_shape, dtype=dtype)
+    ) -> otp.Numpy:
+        var = flow.get_variable(
+            shape=input_shape,
+            dtype=dtype,
+            initializer=flow.constant_initializer(0.0),
+            name="variable",
+        )
+        x = x + var
+        if callable(watch_diff_cb):
+            flow.watch_diff(x, watch_diff_cb)
+        y = flow.slice_v2(x, slice_tup_list, name="SliceWithGrad")
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(y)
+        return y
+
+    return slice_with_grad_job
+
+
+def _make_slice_update_func(
+    slice_tup_list, input_shape, update_shape, dtype=flow.float32, func_cfg=None
+):
+    @flow.global_function(type="predict", function_config=func_cfg)
+    def slice_update_job(
+        x: otp.Numpy.Placeholder(shape=input_shape, dtype=dtype),
+        update: otp.Numpy.Placeholder(shape=update_shape, dtype=dtype),
+    ) -> otp.Numpy:
+        return flow.slice_update(x, update, slice_tup_list)
+
+    return slice_update_job
+
+
+def _make_slice_update_grad_func(
+    slice_tup_list,
+    input_shape,
+    update_shape,
+    diff_watcher_maker=None,
+    dtype=flow.float32,
+    func_cfg=None,
+):
+    @flow.global_function(type="train", function_config=func_cfg)
+    def slice_update_train_job(
+        x: otp.Numpy.Placeholder(shape=input_shape, dtype=dtype),
+        update: otp.Numpy.Placeholder(shape=update_shape, dtype=dtype),
+    ) -> otp.Numpy:
+        x_var = flow.get_variable(
+            shape=input_shape,
+            dtype=dtype,
+            initializer=flow.constant_initializer(0.0),
+            name="x",
+        )
+        update_var = flow.get_variable(
+            shape=update_shape,
+            dtype=dtype,
+            initializer=flow.constant_initializer(0.0),
+            name="update",
+        )
+        x = x + x_var
+        update = update + update_var
+        if callable(diff_watcher_maker):
+            flow.watch_diff(x, diff_watcher_maker(input_shape))
+            flow.watch_diff(update, diff_watcher_maker(update_shape))
+        y = flow.slice_update(x, update, slice_tup_list)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+        ).minimize(y)
+        return y
+
+    return slice_update_train_job
+
+
+def _test_slice(
+    test_case,
+    input,
+    slice_args,
+    outputs,
+    dtype=flow.float32,
+    device_tag=DEFAULT_DEVICE_TAG,
+    verbose=False,
+):
+    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    outputs = [
+        output.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+        for output in outputs
+    ]
+    flow.clear_default_session()
+    func_cfg = flow.FunctionConfig()
+    func_cfg.default_data_type(dtype)
+    func_cfg.default_placement_scope(flow.scope.placement(device_tag, "0:0"))
+    slice_func = _make_slice_func(slice_args, input.shape, dtype, func_cfg)
+    of_outputs = slice_func(input)
+    if verbose:
+        print("input:\n{}".format(input))
+        print("slice_args:", slice_args)
+        print("dtype:", dtype)
+        print("device_tag:", device_tag)
+    for (out, of_out) in zip(outputs, of_outputs):
+        if verbose:
+            print("output:\n{}\n{}".format(out, of_out))
+        test_case.assertTrue(np.array_equal(out, of_out))
+
+
+def _test_slice_dynamic(
+    test_case,
+    input,
+    slice_args,
+    outputs,
+    static_shape=None,
+    dtype=flow.float32,
+    device_tag=DEFAULT_DEVICE_TAG,
+):
+    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    outputs = [
+        output.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+        for output in outputs
+    ]
+    if static_shape is None:
+        static_shape = input.shape
+    flow.clear_default_session()
+    func_cfg = flow.FunctionConfig()
+    func_cfg.default_data_type(dtype)
+    func_cfg.default_placement_scope(flow.scope.placement(device_tag, "0:0"))
+    func_cfg.default_logical_view(flow.scope.mirrored_view())
+    slice_func = _make_slice_dynamic_func(slice_args, static_shape, dtype, func_cfg)
+    of_outputs = slice_func([input])
+    for (out, of_out) in zip(outputs, of_outputs):
+        test_case.assertTrue(np.array_equal(out, of_out[0]))
+
+
+def _test_slice_with_grad(
+    test_case,
+    input,
+    slice_args,
+    output,
+    diff,
+    dtype=flow.float32,
+    device_tag=DEFAULT_DEVICE_TAG,
+    verbose=False,
+):
+    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    output = output.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    diff = diff.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    if verbose:
+        print("dtype: {}".format(dtype))
+        print("device_tag: {}".format(device_tag))
+        print("input: {}\n{}\n".format(input.shape, input))
+        print("output: {}\n{}\n".format(output.shape, output))
+        print("diff: {}\n{}\n".format(diff.shape, diff))
+
+    def WatchDiff(of_diff: otp.Numpy):
+        if verbose:
+            print("of_diff: {}\n{}\n".format(of_diff.shape, of_diff))
+        test_case.assertTrue(np.array_equal(of_diff, diff))
+
+    flow.clear_default_session()
+    func_cfg = flow.FunctionConfig()
+    func_cfg.default_data_type(dtype)
+    func_cfg.default_placement_scope(flow.scope.placement(device_tag, "0:0"))
+    slice_func = _make_slice_with_grad_func(
+        slice_args, input.shape, WatchDiff, dtype, func_cfg
+    )
+    of_output = slice_func(input)
+    if verbose:
+        print("of_output: {}\n{}\n".format(of_output.shape, of_output))
+    test_case.assertTrue(np.array_equal(output, of_output))
+
+
+def _test_slice_update(
+    test_case,
+    input,
+    update,
+    slice_args,
+    output,
+    dtype=flow.float32,
+    device_tag=DEFAULT_DEVICE_TAG,
+    verbose=False,
+):
+    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    update = update.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    output = output.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    flow.clear_default_session()
+    func_cfg = flow.FunctionConfig()
+    func_cfg.default_data_type(dtype)
+    func_cfg.default_placement_scope(flow.scope.placement(device_tag, "0:0"))
+    slice_func = _make_slice_update_func(
+        slice_args, input.shape, update.shape, dtype, func_cfg
+    )
+    of_output = slice_func(input, update)
+    if verbose:
+        print("input:\n{}".format(input))
+        print("update:\n{}".format(update))
+        print("slice_args:", slice_args)
+        print("output:\n{}".format(output))
+        print("dtype:", dtype)
+        print("device_tag:", device_tag)
+        print("of_output:\n{}".format(of_output))
+    test_case.assertTrue(np.array_equal(output, of_output))
+
+
+def _test_slice_update_grad(
+    test_case,
+    input,
+    update,
+    slice_args,
+    output,
+    input_diff,
+    update_diff,
+    dtype=flow.float32,
+    device_tag=DEFAULT_DEVICE_TAG,
+    verbose=False,
+):
+    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    update = update.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    output = output.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    input_diff = input_diff.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    update_diff = update_diff.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    if verbose:
+        print("dtype: {}".format(dtype))
+        print("device_tag: {}".format(device_tag))
+        print("input: {}\n{}\n".format(input.shape, input))
+        print("output: {}\n{}\n".format(output.shape, output))
+
+    def _make_diff_watcher(shape):
+        def _watch_diff(diff: otp.Numpy):
+            if shape == input_diff.shape:
+                test_case.assertTrue(np.array_equal(diff, input_diff))
+            elif shape == update_diff.shape:
+                test_case.assertTrue(np.array_equal(diff, update_diff))
+
+        return _watch_diff
+
+    flow.clear_default_session()
+    func_cfg = flow.FunctionConfig()
+    func_cfg.default_data_type(dtype)
+    func_cfg.default_placement_scope(flow.scope.placement(device_tag, "0:0"))
+    slice_func = _make_slice_update_grad_func(
+        slice_args, input.shape, update.shape, _make_diff_watcher, dtype, func_cfg
+    )
+    ret = slice_func(input, update)
+    test_case.assertTrue(np.array_equal(ret, output))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSliceV2(flow.unittest.TestCase):
+    def test_slice_base(test_case):
+        input = np.random.rand(10)
+        slice_args = [[(1, 7, 2)]]
+        outputs = [input[1:7:2]]
+        arg_dict = collections.OrderedDict()
+        arg_dict["dtype"] = [
+            flow.uint8,
+            flow.int8,
+            flow.int32,
+            flow.int64,
+            flow.float32,
+            flow.float64,
+        ]
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        for kwarg in test_util.GenArgDict(arg_dict):
+            _test_slice(test_case, input, slice_args, outputs, **kwarg)
+
+    def test_slice_into_two_parts(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [
+            [(None, None, None), (0, 2, None), (None, None, None)],
+            [(None, None, None), (2, None, None), (None, None, None)],
+        ]
+        outputs = [input[:, 0:2, :], input[:, 2:, :]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_at_first_dim(test_case):
+        input = np.random.rand(4, 5, 4)
+        slice_args = [[(2, None, None)]]
+        outputs = [input[2:None, :, :]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_at_two_dims(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [[(None, None, None), (0, 2, None), (2, None, None)]]
+        outputs = [input[:, 0:2, 2:]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_with_collapse_dims(test_case):
+        input = np.random.rand(2, 5, 4, 4, 3)
+        slice_args = [
+            [
+                (None, None, None),
+                (0, 2, None),
+                (None, None, None),
+                (None, None, None),
+                (1, None, None),
+            ]
+        ]
+        outputs = [input[:, 0:2, :, :, 1:]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_with_step_two(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [[(None, None, None), (1, None, 2)]]
+        outputs = [input[:, 1::2, :]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_at_two_dim_with_step_more_than_one(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [[(None, None, None), (1, None, 3), (None, None, 2)]]
+        outputs = [input[:, 1::3, ::2]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_with_neg_start(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [[(None, None, None), (-4, None, None)]]
+        outputs = [input[:, -4:, :]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_with_neg_stop(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [[(None, None, None), (None, -2, None)]]
+        outputs = [input[:, :-2, :]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_with_neg_step(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [[(None, None, None), (None, None, -1)]]
+        outputs = [input[:, ::-1, :]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    def test_slice_with_neg_step_two(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_args = [[(None, None, None), (-1, 1, -2)]]
+        outputs = [input[:, -1:1:-2, :]]
+        _test_slice(test_case, input, slice_args, outputs)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_slice_with_float16(test_case):
+        input = np.random.rand(10).astype(np.float32)
+        slice_args = [[(2, 7, None)]]
+        outputs = [input[2:7]]
+        flow.clear_default_session()
+        flow.config.gpu_device_num(1)
+        slice_func = _make_slice_with_fp16_func(slice_args, input.shape)
+        of_outputs = slice_func(input)
+        test_case.assertTrue(
+            np.allclose(outputs[0], of_outputs[0], rtol=0.001, atol=0.0001)
+        )
+
+    def test_slice_dynamic_base(test_case):
+        input = np.random.rand(2, 4, 4)
+        slice_args = [[(None, None, None), (1, None, None)]]
+        outputs = [input[:, 1:, :]]
+        arg_dict = collections.OrderedDict()
+        arg_dict["dtype"] = [
+            flow.uint8,
+            flow.int8,
+            flow.int32,
+            flow.int64,
+            flow.float32,
+            flow.float64,
+        ]
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        for kwarg in test_util.GenArgDict(arg_dict):
+            _test_slice_dynamic(
+                test_case, input, slice_args, outputs, static_shape=(2, 5, 5), **kwarg
+            )
+
+    def test_slice_dynamic_at_two_dims(test_case):
+        input = np.random.rand(2, 3, 2, 2)
+        slice_args = [
+            [(None, None, None), (2, None, None), (None, None, None), (1, None, None)]
+        ]
+        outputs = [input[:, 2:, :, 1:]]
+        _test_slice_dynamic(
+            test_case, input, slice_args, outputs, static_shape=(2, 5, 3, 3)
+        )
+
+    def test_slice_dynamic_at_first_dim_and_last_dim(test_case):
+        input = np.random.rand(3, 6, 3, 3)
+        slice_args = [
+            [(1, None, None), (None, None, None), (None, None, None), (1, None, None)]
+        ]
+        outputs = [input[1:, :, :, 1:]]
+        _test_slice_dynamic(
+            test_case, input, slice_args, outputs, static_shape=(4, 5, 5, 3)
+        )
+
+    def test_slice_dynamic_neg_start(test_case):
+        input = np.random.rand(2, 10)
+        slice_args = [[(None, None, None), (-5, None, None)]]
+        outputs = [input[:, -5:]]
+        _test_slice_dynamic(test_case, input, slice_args, outputs, static_shape=(3, 7))
+
+    def test_slice_dynamic_neg_step(test_case):
+        input = np.random.rand(2, 10)
+        slice_args = [[(None, None, None), (None, -5, -1)]]
+        outputs = [input[:, :-5:-1]]
+        _test_slice_dynamic(test_case, input, slice_args, outputs, static_shape=(3, 7))
+
+    def test_slice_dynamic_anomaly(test_case):
+        input = np.random.rand(4, 7)
+        slice_args = [[(None, None, None), (2, None, None)]]
+        outputs = [input[:, 2:]]
+        _test_slice_dynamic(test_case, input, slice_args, outputs, static_shape=(5, 6))
+
+    def test_slice_dynamic_empty_blob(test_case):
+        input = np.random.rand(5, 0, 5)
+        slice_args = [[(None, None, None), (None, None, None), (2, 3, None)]]
+        outputs = [input[:, :, 2:3]]
+        _test_slice_dynamic(
+            test_case, input, slice_args, outputs, static_shape=(8, 2, 10)
+        )
+
+    "This test case will raise fatal error, error infomation is like below:\n    F0808 00:20:19.768465 23960 user_kernel.cpp:451] Check failed: shape_view.elem_cnt() <= static_shape.elem_cnt() (12 vs. 9)\n    InferShape of OpKernel (op_type_name: slice, op_name: SliceDynamic_0) raise error,\n    output arg's (name: y, index: 0) runtime shape (2,6) surpass the limit of static shape (3,3)\n    *** Check failure stack trace: ***\n    ...\n    The reason is the dismatch between static slice (for memory) and dynamic slice (real slice)\n    The result shape of slice [:, 3:-1] for static shape (3, 7) is (3, 3)\n    which indicate that blob has prod(3, 3) memory limit,\n    and the result shape of slice [:, 3:-1] for dynamic shape (2, 10) is (2, 6)\n    which will cause blob to be out of memory limit.\n    "
+    "\n    static shape after slice is (5, 4)\n    dynamic shape after slice is (4, 5)\n    static shape after slice is (5, 3)\n    dynamic shape after slice is (4, 4)\n    "
+
+    def test_slice_with_grad(test_case):
+        input = np.random.rand(2, 5, 4)
+        slice_tup_list = [(None, None, None), (2, -2, None)]
+        output = input[:, 2:-2, :]
+        diff = np.zeros(input.shape, dtype=input.dtype)
+        diff[:, 2:-2, :] = 1
+        arg_dict = collections.OrderedDict()
+        arg_dict["dtype"] = [flow.float32, flow.float64]
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        arg_dict["verbose"] = [False]
+        for kwarg in test_util.GenArgDict(arg_dict):
+            _test_slice_with_grad(
+                test_case, input, slice_tup_list, output, diff, **kwarg
+            )
+
+    def test_slice_update(test_case):
+        input = np.random.rand(10, 5, 4)
+        update = input[5:, :-1, ::2]
+        update = np.random.rand(*update.shape)
+        output = np.copy(input)
+        output[5:, :-1, ::2] = update
+        slice_tup_list = [(5, None, None), (None, -1, None), (None, None, 2)]
+        arg_dict = collections.OrderedDict()
+        arg_dict["dtype"] = [flow.float32, flow.float64]
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        arg_dict["verbose"] = [False]
+        for kwarg in test_util.GenArgDict(arg_dict):
+            _test_slice_update(
+                test_case, input, update, slice_tup_list, output, **kwarg
+            )
+
+    def test_slice_update_grad(test_case):
+        input = np.random.rand(2, 7)
+        update = input[:, 1:4]
+        update = np.random.rand(*update.shape)
+        update_diff = np.ones(update.shape)
+        input_diff = np.ones(input.shape)
+        input_diff[:, 1:4] = 0
+        output = np.copy(input)
+        output[:, 1:4] = update
+        slice_tup_list = [(None, None, None), (1, 4, None)]
+        arg_dict = collections.OrderedDict()
+        arg_dict["dtype"] = [flow.float32, flow.float64]
+        arg_dict["device_tag"] = ["cpu", "gpu"]
+        arg_dict["verbose"] = [False]
+        for kwarg in test_util.GenArgDict(arg_dict):
+            _test_slice_update_grad(
+                test_case,
+                input,
+                update,
+                slice_tup_list,
+                output,
+                input_diff,
+                update_diff,
+                **kwarg
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_smooth_l1_loss.py b/python/oneflow/compatible/single_client/test/ops/test_smooth_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..e53314788b427dc802acc9bd6db25b119bf622b2
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_smooth_l1_loss.py
@@ -0,0 +1,122 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+import uuid
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def gen_numpy_data(prediction, label, beta=1.0):
+    original_shape = prediction.shape
+    elem_cnt = prediction.size
+    prediction = prediction.reshape(-1)
+    label = label.reshape(-1)
+    loss = np.zeros(elem_cnt).astype(prediction.dtype)
+    prediction_grad = np.zeros(elem_cnt).astype(prediction.dtype)
+    for i in np.arange(elem_cnt):
+        abs_diff = abs(prediction[i] - label[i])
+        if abs_diff < beta:
+            loss[i] = 0.5 * abs_diff * abs_diff / beta
+        else:
+            loss[i] = abs_diff - 0.5 * beta
+    for i in np.arange(elem_cnt):
+        diff = prediction[i] - label[i]
+        abs_diff = abs(diff)
+        if abs_diff < beta:
+            prediction_grad[i] = diff / beta
+        else:
+            prediction_grad[i] = np.sign(diff)
+    return {
+        "loss": loss.reshape(original_shape),
+        "prediction_grad": prediction_grad.reshape(original_shape),
+    }
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSmoothL1Loss(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_smooth_l1_loss(_):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["prediction_shape"] = [(100,), (10, 10)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["beta"] = [0, 0.5, 1]
+        for case in GenArgList(arg_dict):
+            (device_type, prediction_shape, data_type, beta) = case
+            assert device_type in ["gpu", "cpu"]
+            assert data_type in ["float32", "double", "int8", "int32", "int64"]
+            flow.clear_default_session()
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+            prediction = np.random.randn(*prediction_shape).astype(
+                type_name_to_np_type[data_type]
+            )
+            label = np.random.randn(*prediction_shape).astype(
+                type_name_to_np_type[data_type]
+            )
+            np_result = gen_numpy_data(prediction, label, beta)
+
+            def assert_prediction_grad(b):
+                prediction_grad = np_result["prediction_grad"]
+                assert prediction_grad.dtype == type_name_to_np_type[data_type]
+                assert np.allclose(prediction_grad, b.numpy()), (
+                    case,
+                    prediction_grad,
+                    b.numpy(),
+                )
+
+            @flow.global_function(type="train", function_config=func_config)
+            def TestJob(
+                prediction: oft.Numpy.Placeholder(
+                    prediction_shape, dtype=type_name_to_flow_type[data_type]
+                ),
+                label: oft.Numpy.Placeholder(
+                    prediction_shape, dtype=type_name_to_flow_type[data_type]
+                ),
+            ):
+                v = flow.get_variable(
+                    "prediction",
+                    shape=prediction_shape,
+                    dtype=type_name_to_flow_type[data_type],
+                    initializer=flow.constant_initializer(0),
+                    trainable=True,
+                )
+                flow.watch_diff(v, assert_prediction_grad)
+                prediction += v
+                with flow.scope.placement(device_type, "0:0"):
+                    loss = flow.smooth_l1_loss(prediction, label, beta)
+                    flow.optimizer.SGD(
+                        flow.optimizer.PiecewiseConstantScheduler([], [0.0001]),
+                        momentum=0,
+                    ).minimize(loss)
+                    return loss
+
+            loss_np = np_result["loss"]
+            assert loss_np.dtype == type_name_to_np_type[data_type]
+            loss = TestJob(prediction, label).get().numpy()
+            assert np.allclose(loss_np, loss), (case, loss_np, loss)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_softmax.py b/python/oneflow/compatible/single_client/test/ops/test_softmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..593b6246c74852d27ab532172b4fbf3fc6539b02
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_softmax.py
@@ -0,0 +1,133 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape, data_type, axis):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    if data_type == "float16":
+        dtype = flow.float
+    else:
+        dtype = type_name_to_flow_type[data_type]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SoftmaxJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=dtype,
+                initializer=flow.random_uniform_initializer(minval=-1.0, maxval=1.0),
+                trainable=True,
+            )
+            x1 = x
+            x = flow.identity(x)
+            if data_type == "float16":
+                loss = flow.cast(
+                    flow.nn.softmax(flow.cast(x, dtype=flow.float16), axis=axis),
+                    dtype=flow.float,
+                )
+            else:
+                loss = flow.nn.softmax(x, axis=axis)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            total_loss = loss * x1
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(total_loss)
+            return loss
+
+    of_out = SoftmaxJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.softmax(x, axis=axis)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    if data_type == "float16":
+        tolerance = 0.001
+    else:
+        tolerance = 1e-05
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=tolerance, atol=tolerance)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"),
+        tf_x_diff.numpy(),
+        rtol=tolerance,
+        atol=tolerance,
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSoftmax(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_softmax_shape(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [
+            (10, 10, 20, 30),
+            (10, 20, 13),
+            (10, 20, 30),
+            (10, 20),
+            (10, 60),
+            (15, 60),
+            (32, 12, 128),
+            (10, 960),
+            (12, 2001),
+            (10, 4096),
+            (10, 8092),
+            (256, 1001),
+            (100, 65536),
+            (10, 65535),
+        ]
+        arg_dict["data_type"] = ["float32", "double", "float16"]
+        arg_dict["axis"] = [-1]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[2] == "float16":
+                continue
+            compare_with_tensorflow(*arg)
+
+    def test_softmax_axis(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(10, 20, 30, 40)]
+        arg_dict["data_type"] = ["float32", "double", "float16"]
+        arg_dict["axis"] = [-4, -3, -2, -1, 0, 1, 2, 3]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[2] == "float16":
+                continue
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_softmax_cross_entropy.py b/python/oneflow/compatible/single_client/test/ops/test_softmax_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..278e826a4670fb91e0947ff5bd8a5778de2c355e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_softmax_cross_entropy.py
@@ -0,0 +1,128 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, data_type, shape):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    if data_type == "float16":
+        dtype = flow.float
+    else:
+        dtype = type_name_to_flow_type[data_type]
+
+    def np_softmax(x):
+        return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SoftmaxCrossEntropyWithLogitsJob(labels: oft.Numpy.Placeholder(shape, dtype)):
+        with flow.scope.placement(device_type, "0:0"):
+            if data_type == "float16":
+                x = flow.get_variable(
+                    "x",
+                    shape=shape,
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(20),
+                    trainable=True,
+                )
+                loss = flow.cast(
+                    flow.nn.softmax_cross_entropy_with_logits(
+                        flow.cast(labels, dtype=flow.float16),
+                        flow.cast(x, dtype=flow.float16),
+                    ),
+                    dtype=flow.float,
+                )
+            else:
+                x = flow.get_variable(
+                    "x",
+                    shape=shape,
+                    dtype=type_name_to_flow_type[data_type],
+                    initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                    trainable=True,
+                )
+                loss = flow.nn.softmax_cross_entropy_with_logits(
+                    labels=labels, logits=x
+                )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    if data_type == "float16":
+        labels = np_softmax(np.random.randint(0, 10, size=shape)).astype(np.float32)
+    else:
+        labels = np_softmax(np.random.randint(0, 10, size=shape)).astype(
+            type_name_to_np_type[data_type]
+        )
+    of_out = SoftmaxCrossEntropyWithLogitsJob(labels).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.softmax_cross_entropy_with_logits(labels, x)
+        tf16_out = tf.nn.softmax_cross_entropy_with_logits(
+            tf.cast(labels, dtype=tf.float16), tf.cast(x, dtype=tf.float16)
+        )
+        loss_diff = test_global_storage.Get("loss_diff")
+        tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    if data_type == "float16":
+        tolerance = 0.001
+    else:
+        tolerance = 1e-05
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=tolerance, atol=tolerance)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"),
+        tf_x_diff.numpy(),
+        rtol=tolerance,
+        atol=tolerance,
+    )
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSoftmaxCrossEntropy(flow.unittest.TestCase):
+    def test_softmax_cross_entropy_with_logits(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["data_type"] = ["double", "float32", "float16"]
+        arg_dict["shape"] = [(64, 1000), (5, 5, 1000)]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[1] == "float16":
+                continue
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sort.py b/python/oneflow/compatible/single_client/test/ops/test_sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..b364447cab7979ac0746f1782a2170acef73058d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sort.py
@@ -0,0 +1,89 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, in_shape, axis, direction, data_type):
+    assert device_type in ["gpu", "cpu"]
+    assert data_type in ["float32", "double", "int8", "int32", "int64"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def SortJob(
+        input: oft.ListNumpy.Placeholder(
+            tuple([dim + 10 for dim in in_shape]),
+            dtype=type_name_to_flow_type[data_type],
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.sort(input, axis, direction)
+
+    input = (np.random.random(in_shape) * 100).astype(type_name_to_np_type[data_type])
+    of_out = SortJob([input]).get().numpy_list()[0]
+    tf_out = tf.sort(input, axis, direction)
+    assert np.array_equal(of_out, tf_out.numpy())
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["in_shape"] = [(10,), (10, 10, 20)]
+    arg_dict["axis"] = [-1]
+    arg_dict["direction"] = ["ASCENDING", "DESCENDING"]
+    arg_dict["data_type"] = ["float32", "double"]
+    return GenArgList(arg_dict)
+
+
+def gen_arg_list_for_test_axis():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["in_shape"] = [(10, 10, 20)]
+    arg_dict["axis"] = [-2, 0, 2]
+    arg_dict["direction"] = ["ASCENDING", "DESCENDING"]
+    arg_dict["data_type"] = ["int32", "int64"]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSort(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_sort(test_case):
+        for arg in gen_arg_list():
+            compare_with_tensorflow(*arg)
+        for arg in gen_arg_list_for_test_axis():
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sparse_cross_entropy.py b/python/oneflow/compatible/single_client/test/ops/test_sparse_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..78cc36eeacba32e67f5a0e5d46b143d396c67f62
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sparse_cross_entropy.py
@@ -0,0 +1,100 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type, data_type, label_type, num_classes, batch_size
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SparseSoftmaxCrossEntropyWithLogitsJob(
+        labels: oft.Numpy.Placeholder(
+            (batch_size,), dtype=type_name_to_flow_type[label_type]
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=(batch_size, num_classes),
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            prediction = flow.nn.softmax(logits=x)
+            loss = flow.nn.sparse_cross_entropy(labels=labels, prediction=prediction)
+            loss = flow.math.square(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+        return loss
+
+    labels = np.random.randint(0, num_classes, size=(batch_size,)).astype(
+        type_name_to_np_type[label_type]
+    )
+    of_out = SparseSoftmaxCrossEntropyWithLogitsJob(labels).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.sparse_softmax_cross_entropy_with_logits(labels, x)
+        tf_out = tf.math.square(tf_out)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSparseCrossEntropy(flow.unittest.TestCase):
+    def test_sparse_cross_entropy_with_logits(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["label_type"] = ["int32", "int64"]
+        arg_dict["num_classes"] = [1000]
+        arg_dict["batch_size"] = [64]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sparse_cross_entropy_ms.py b/python/oneflow/compatible/single_client/test/ops/test_sparse_cross_entropy_ms.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f84ca1808e625ecb1cd2eaaf610907636a3047d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sparse_cross_entropy_ms.py
@@ -0,0 +1,112 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type, data_type, label_type, num_classes, batch_size
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.gpu_device_num(0)
+        flow.config.cpu_device_num(4)
+    else:
+        flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SparseSoftmaxCrossEntropyWithLogitsJob(
+        labels: oft.Numpy.Placeholder(
+            (batch_size,), dtype=type_name_to_flow_type[label_type]
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=(batch_size, num_classes),
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            prediction = flow.nn.softmax(logits=x)
+        with flow.scope.placement(device_type, "0:0-3"):
+            lebels_distribute = flow.distribute.broadcast()
+            prediction_distribute = flow.distribute.split(len(prediction.shape) - 1)
+            loss = flow.nn.sparse_cross_entropy(
+                labels=labels.with_distribute(lebels_distribute),
+                prediction=prediction.with_distribute(prediction_distribute),
+            )
+        with flow.scope.placement(device_type, "0:0"):
+            loss = flow.math.square(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+        return loss
+
+    labels = np.random.randint(0, num_classes, size=(batch_size,)).astype(
+        type_name_to_np_type[label_type]
+    )
+    of_out = SparseSoftmaxCrossEntropyWithLogitsJob(labels).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.sparse_softmax_cross_entropy_with_logits(labels, x)
+        tf_out = tf.math.square(tf_out)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestSparseCrossEntropyMs(flow.unittest.TestCase):
+    def test_sparse_cross_entropy_with_logits(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["label_type"] = ["int32", "int64"]
+        arg_dict["num_classes"] = [1000]
+        arg_dict["batch_size"] = [64]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sparse_softmax_cross_entropy.py b/python/oneflow/compatible/single_client/test/ops/test_sparse_softmax_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..aac00ef441a8571e34955cd6b2366bce5209a4ad
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sparse_softmax_cross_entropy.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type, data_type, label_type, num_classes, batch_size
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SparseSoftmaxCrossEntropyWithLogitsJob(
+        labels: oft.Numpy.Placeholder(
+            (batch_size,), dtype=type_name_to_flow_type[label_type]
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=(batch_size, num_classes),
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=labels, logits=x
+            )
+            loss = flow.math.square(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+        return loss
+
+    labels = np.random.randint(0, num_classes, size=(batch_size,)).astype(
+        type_name_to_np_type[label_type]
+    )
+    of_out = SparseSoftmaxCrossEntropyWithLogitsJob(labels).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.sparse_softmax_cross_entropy_with_logits(labels, x)
+        tf_out = tf.math.square(tf_out)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSparseSoftmaxCrossEntropy(flow.unittest.TestCase):
+    def test_sparse_softmax_cross_entropy_with_logits(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["label_type"] = ["int32", "int64"]
+        arg_dict["num_classes"] = [1000]
+        arg_dict["batch_size"] = [64]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sparse_softmax_cross_entropy_ms.py b/python/oneflow/compatible/single_client/test/ops/test_sparse_softmax_cross_entropy_ms.py
new file mode 100644
index 0000000000000000000000000000000000000000..547515adee6d5223de335918685dff3e027d8f3b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sparse_softmax_cross_entropy_ms.py
@@ -0,0 +1,112 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(
+    device_type, data_type, label_type, num_classes, batch_size
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SparseSoftmaxCrossEntropyWithLogitsJob(
+        labels: oft.Numpy.Placeholder(
+            (batch_size,), dtype=type_name_to_flow_type[label_type]
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=(batch_size, num_classes),
+                dtype=type_name_to_flow_type[data_type],
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+        with flow.scope.placement(device_type, "0:0-3"):
+            labels = flow.parallel_cast(labels, distribute=flow.distribute.broadcast())
+            logits = flow.parallel_cast(
+                x, distribute=flow.distribute.split(len(x.shape) - 1)
+            )
+            loss = flow.nn.distributed_sparse_softmax_cross_entropy_with_logits(
+                labels, logits
+            )
+            loss = flow.math.square(loss)
+        with flow.scope.placement(device_type, "0:0"):
+            loss = flow.identity(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    labels = np.random.randint(0, num_classes, size=(batch_size,)).astype(
+        type_name_to_np_type[label_type]
+    )
+    of_out = SparseSoftmaxCrossEntropyWithLogitsJob(labels).get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.nn.sparse_softmax_cross_entropy_with_logits(labels, x)
+        tf_out = tf.math.square(tf_out)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+    flow.clear_default_session()
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestSparseSoftmaxCrossEntropyMs(flow.unittest.TestCase):
+    def test_sparse_softmax_cross_entropy_with_logits(test_case):
+        if flow.eager_execution_enabled():
+            print("\nSkip under erger mode!")
+            return
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["label_type"] = ["int32", "int64"]
+        arg_dict["num_classes"] = [1000]
+        arg_dict["batch_size"] = [64]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_split_like.py b/python/oneflow/compatible/single_client/test/ops/test_split_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd451390d52494686e73614e41cecfcd68909392
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_split_like.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import random
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def split_like(input, like, name):
+    return (
+        flow.user_op_builder(name)
+        .Op("split_like")
+        .Input("in", [input])
+        .Input("like", like)
+        .Output("out", len(like))
+        .Attr("axis", 0)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+
+
+def compare_with_np(device_type, x_shape, like0_shape, like1_shape, dtype):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SplitLikeJob(x: oft.Numpy.Placeholder(x_shape, dtype=flow.float)):
+        v = flow.get_variable(
+            "x",
+            shape=x_shape,
+            dtype=flow.float,
+            initializer=flow.constant_initializer(0),
+            trainable=True,
+        )
+        x += v
+        like0 = flow.constant(0, dtype=flow.float, shape=like0_shape)
+        like1 = flow.constant(0, dtype=flow.float, shape=like1_shape)
+        with flow.scope.placement("gpu", "0:0"):
+            (y0, y1) = split_like(x, [like0, like1], "split_like")
+            loss = y0
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+        ).minimize(loss)
+        flow.watch(x, test_global_storage.Setter("x"))
+        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+        flow.watch(loss, test_global_storage.Setter("loss"))
+        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+        return (y0, y1)
+
+    x = np.random.randn(*x_shape).astype(np.float32)
+    (y0, y1) = SplitLikeJob(x).get()
+    assert like0_shape[0] + like1_shape[0] == x_shape[0]
+    np_y0 = x[0 : like0_shape[0]]
+    np_y1 = x[like0_shape[0] :]
+    zeros = np.zeros(np_y1.shape, dtype=np.float32)
+    np_x_diff = np.concatenate([test_global_storage.Get("loss_diff"), zeros], axis=0)
+    assert np.array_equal(y0.numpy(), np_y0)
+    assert np.array_equal(y1.numpy(), np_y1)
+    assert np.array_equal(test_global_storage.Get("x_diff"), np_x_diff)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSplitLike(flow.unittest.TestCase):
+    def test_split_like_axis0(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(15, 20, 10)]
+        arg_dict["like0_shape"] = [(10,)]
+        arg_dict["like1_shape"] = [(5,)]
+        arg_dict["dtype"] = ["float32", "double"]
+        for arg in GenArgList(arg_dict):
+            compare_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sqrt.py b/python/oneflow/compatible/single_client/test/ops/test_sqrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..60c3814862842a065e3b84414a1dbb5b4a01cfd3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sqrt.py
@@ -0,0 +1,48 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import CompareOpWithTensorFlow, GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSqrt(flow.unittest.TestCase):
+    def test_sqrt(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["flow_op"] = [flow.math.sqrt]
+        arg_dict["tf_op"] = [tf.math.sqrt]
+        arg_dict["input_shape"] = [(10, 20, 30)]
+        arg_dict["input_minval"] = [0]
+        arg_dict["input_maxval"] = [100]
+        for arg in GenArgDict(arg_dict):
+            CompareOpWithTensorFlow(**arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_square.py b/python/oneflow/compatible/single_client/test/ops/test_square.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1e13f98734b3c2a4ff92c21534c0461bb739979
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_square.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SquareJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "x",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+                trainable=True,
+            )
+            loss = flow.math.square(x)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = SquareJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.math.square(x)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSquare(flow.unittest.TestCase):
+    def test_square(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["x_shape"] = [(10, 20, 30)]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_square_sum.py b/python/oneflow/compatible/single_client/test/ops/test_square_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..4957d364e6e0f4684af5f99dc780d77bccb0a694
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_square_sum.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def _check(test_case, x, y):
+    ref_y = np.array(np.sum(x ** 2))
+    test_case.assertTrue(np.allclose(y, ref_y))
+
+
+def _run_test(test_case, x, dtype, device):
+    @flow.global_function(function_config=func_config)
+    def SquareSum(x: oft.Numpy.Placeholder(x.shape, dtype=dtype)):
+        with flow.scope.placement(device, "0:0"):
+            return flow.experimental.square_sum(x)
+
+    y = SquareSum(x).get()
+    _check(test_case, x, y.numpy())
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSquareSum(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_square_sum_random_gpu(test_case):
+        x = np.random.uniform(-0.01, 0.01, (64, 64)).astype(np.float32)
+        _run_test(test_case, x, flow.float32, "gpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_square_sum_small_blob_gpu(test_case):
+        x = np.random.uniform(-0.01, 0.01, (64,)).astype(np.float32)
+        _run_test(test_case, x, flow.float32, "gpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_squared_difference.py b/python/oneflow/compatible/single_client/test/ops/test_squared_difference.py
new file mode 100644
index 0000000000000000000000000000000000000000..339606f3187e956c1f7b363679da9278c5de469d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_squared_difference.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import Args, CompareOpWithTensorFlow, GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def GenerateTest(test_case, a_shape, b_shape):
+    @flow.global_function(function_config=func_config)
+    def SqrDiffJob(
+        a: oft.Numpy.Placeholder(a_shape), b: oft.Numpy.Placeholder(b_shape)
+    ):
+        return flow.math.squared_difference(a, b)
+
+    a = np.random.rand(*a_shape).astype(np.float32)
+    b = np.random.rand(*b_shape).astype(np.float32)
+    y = SqrDiffJob(a, b).get().numpy()
+    test_case.assertTrue(np.allclose(y, (a - b) * (a - b)))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSquaredDifference(flow.unittest.TestCase):
+    def test_naive(test_case):
+        @flow.global_function(function_config=func_config)
+        def SqrDiffJob(
+            a: oft.Numpy.Placeholder((5, 2)), b: oft.Numpy.Placeholder((5, 2))
+        ):
+            return flow.math.squared_difference(a, b)
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = np.random.rand(5, 2).astype(np.float32)
+        z = None
+        z = SqrDiffJob(x, y).get().numpy()
+        test_case.assertTrue(np.allclose(z, (x - y) * (x - y)))
+
+    def test_broadcast(test_case):
+        @flow.global_function(function_config=func_config)
+        def SqrDiffJob(
+            a: oft.Numpy.Placeholder((5, 2)), b: oft.Numpy.Placeholder((1, 2))
+        ):
+            return flow.math.squared_difference(a, b)
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = np.random.rand(1, 2).astype(np.float32)
+        z = None
+        z = SqrDiffJob(x, y).get().numpy()
+        test_case.assertTrue(np.allclose(z, (x - y) * (x - y)))
+
+    def test_xy_sqr_diff_x1(test_case):
+        GenerateTest(test_case, (64, 64), (64, 1))
+
+    def test_xy_sqr_diff_1y(test_case):
+        GenerateTest(test_case, (64, 64), (1, 64))
+
+    def test_xyz_sqr_diff_x1z(test_case):
+        GenerateTest(test_case, (64, 64, 64), (64, 1, 64))
+
+    def test_xyz_sqr_diff_1y1(test_case):
+        GenerateTest(test_case, (64, 64, 64), (1, 64, 1))
+
+    def test_scalar_sqr_diff(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["flow_op"] = [flow.math.squared_difference]
+        arg_dict["tf_op"] = [tf.math.squared_difference]
+        arg_dict["input_shape"] = [(10, 10, 10)]
+        arg_dict["op_args"] = [
+            Args([1]),
+            Args([-1]),
+            Args([84223.19348]),
+            Args([-3284.139]),
+        ]
+        for arg in GenArgDict(arg_dict):
+            CompareOpWithTensorFlow(**arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_squeeze.py b/python/oneflow/compatible/single_client/test/ops/test_squeeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..b578f81c545253163160552bbbcb117fd0b42dda
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_squeeze.py
@@ -0,0 +1,86 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, x_shape, axis):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    def check_grad(x_diff_blob):
+        assert np.array_equal(x_diff_blob.numpy(), np.ones(x_shape))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def SqueezeJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "var",
+                shape=x_shape,
+                dtype=flow.float,
+                initializer=flow.ones_initializer(),
+                trainable=True,
+            )
+            flow.watch_diff(x, check_grad)
+            loss = flow.squeeze(x, axis)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            return loss
+
+    of_out = SqueezeJob().get().numpy()
+    tf_out = tf.squeeze(np.ones(x_shape, dtype=np.float32), axis).numpy()
+    tf_out = np.array([tf_out]) if isinstance(tf_out, np.float32) else tf_out
+    assert np.array_equal(of_out, tf_out)
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["in_shape"] = [(1, 10, 1, 10, 1)]
+    arg_dict["axis"] = [None, [2], [-3], [0, 2, 4], [-1, -3, -5]]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSqueeze(flow.unittest.TestCase):
+    def test_squeeze(test_case):
+        for arg in gen_arg_list():
+            compare_with_tensorflow(*arg)
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None:
+            compare_with_tensorflow("gpu", (1, 1, 1), [0, 1, 2])
+            compare_with_tensorflow("gpu", (5, 6, 7), None)
+        compare_with_tensorflow("cpu", (1, 1, 1), [0, 1, 2])
+        compare_with_tensorflow("cpu", (5, 6, 7), None)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_ssp_variable_proxy.py b/python/oneflow/compatible/single_client/test/ops/test_ssp_variable_proxy.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5342232ea90a245934f749992f24524a8534457
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_ssp_variable_proxy.py
@@ -0,0 +1,146 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+@flow.unittest.skip_unless_1n1d()
+class Test1dSspVariableProxy(flow.unittest.TestCase):
+    def test_1d_ring_buffer_Wm_assign_Wc_plus_1(test_case):
+        if flow.eager_execution_enabled():
+            return
+        device_name = "0:0"
+        flow.config.cpu_device_num(2)
+        buffer_size = 4
+
+        @flow.global_function()
+        def Foo() -> tp.Numpy:
+            with flow.scope.placement("cpu", device_name):
+                w = flow.get_variable(
+                    "w",
+                    shape=(10,),
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                )
+                ones = flow.constant_like(w, value=1.0, dtype=flow.float)
+                (ref, value) = flow.experimental.ssp_variable_proxy(
+                    w, buffer_size=buffer_size
+                )
+                flow.assign(ref, value + ones)
+                return value
+
+        zeros = np.zeros((10,)).astype(np.float32)
+        ones = np.ones((10,)).astype(np.float32)
+        for i in range(buffer_size):
+            x = Foo()
+            test_case.assertTrue(np.allclose(x, zeros))
+        for i in range(buffer_size):
+            x = Foo()
+            test_case.assertTrue(np.allclose(x, ones))
+        for i in range(buffer_size):
+            x = Foo()
+            test_case.assertTrue(np.allclose(x, ones + ones))
+
+    def test_1d_ring_buffer_Wm_assign_Wm_plus_1(test_case):
+        if flow.eager_execution_enabled():
+            return
+        device_name = "0:0"
+        flow.config.cpu_device_num(2)
+        buffer_size = 4
+
+        @flow.global_function()
+        def Foo() -> tp.Numpy:
+            with flow.scope.placement("cpu", device_name):
+                w = flow.get_variable(
+                    "w",
+                    shape=(10,),
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                )
+                ones = flow.constant_like(w, value=1.0, dtype=flow.float)
+                (ref, value) = flow.experimental.ssp_variable_proxy(
+                    w, buffer_size=buffer_size
+                )
+                flow.assign(ref, ref + ones)
+                return value
+
+        zeros = np.zeros((10,)).astype(np.float32)
+        ones = np.ones((10,)).astype(np.float32)
+        for i in range(buffer_size):
+            x = Foo()
+            test_case.assertTrue(np.allclose(x, zeros))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones + ones))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones + ones + ones))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones + ones + ones + ones))
+
+    def test_add_ssp_variable_proxy(test_case):
+        if flow.eager_execution_enabled():
+            return
+        device_name = "0:0"
+        flow.config.enable_debug_mode(True)
+        flow.config.cpu_device_num(2)
+        buffer_size = 4
+        function_config = flow.FunctionConfig()
+        function_config.enable_ssp(True)
+
+        @flow.global_function(type="train", function_config=function_config)
+        def Foo() -> tp.Numpy:
+            with flow.scope.placement(
+                "cpu", device_name
+            ), flow.experimental.scope.config(
+                ssp_num_stages=buffer_size, ssp_stage_id=0
+            ):
+                w = flow.get_variable(
+                    "w",
+                    shape=(10,),
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                )
+                loss = w + flow.constant_like(w, value=0.0, dtype=flow.float)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [-10.0]), momentum=0
+                ).minimize(loss)
+                return loss
+
+        zeros = np.zeros((10,)).astype(np.float32)
+        ones = np.ones((10,)).astype(np.float32)
+        for i in range(buffer_size):
+            x = Foo()
+            test_case.assertTrue(np.allclose(x, zeros))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones + ones))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones + ones + ones))
+        x = Foo()
+        test_case.assertTrue(np.allclose(x, ones + ones + ones + ones))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_stack.py b/python/oneflow/compatible/single_client/test/ops/test_stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..d30ce38213daacbe64a59fe7fa352b9fbcc5d730
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_stack.py
@@ -0,0 +1,131 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_stack_with_np(input_shape, axis, device_type, machine_ids, device_counts):
+    input_1 = np.random.random(size=input_shape).astype(np.float32)
+    input_2 = np.random.random(size=input_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+
+    def np_stack(np_input_1, np_input_2, np_axis):
+        return np.stack([np_input_1, np_input_2], axis=np_axis)
+
+    np_out_stack = np_stack(input_1, input_2, axis)
+    np_random_mul = np.random.random(size=np_out_stack.shape).astype(np.float32)
+
+    def np_diff(np_input, np_axis):
+        np_stack_grad = np.split(np_input, indices_or_sections=2, axis=np_axis)[0]
+        return np.squeeze(np_stack_grad, np_axis)
+
+    _np_grad = np_diff(np_random_mul, axis)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_stack(
+        of_input_1: tp.Numpy.Placeholder(shape=input_1.shape),
+        of_input_2: tp.Numpy.Placeholder(shape=input_2.shape),
+        of_mul: tp.Numpy.Placeholder(shape=np_random_mul.shape),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input_1.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input_1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        of_stack_out = flow.stack([x_var, of_input_2], axis=axis)
+        out = of_stack_out * of_mul
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(out)
+        return of_stack_out
+
+    of_out_stack = oneflow_stack(input_1, input_2, np_random_mul)
+    assert np.allclose(of_out_stack, np_out_stack)
+
+
+def _gen_arg_dict(shape, axis, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["axis"] = [axis]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Teststack1n1d(flow.unittest.TestCase):
+    def test_stack_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 6), axis=2, device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_stack_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_stack_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32),
+            axis=-4,
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_stack_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Teststack1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_stack_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 8, 8, 4),
+            axis=3,
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_stack_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_stateful_local_kernel.py b/python/oneflow/compatible/single_client/test/ops/test_stateful_local_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a7b733d2f3c27d7fe7b81be8c25e26d206091d9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_stateful_local_kernel.py
@@ -0,0 +1,71 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+class TestStatefulLocalKernel(flow.unittest.TestCase):
+    def test_dynamic_attrs(test_case):
+        x = (
+            flow.builtin_op("constant")
+            .Output("out")
+            .Attr("is_floating_value", True)
+            .Attr("floating_value", 3.0)
+            .Attr("dtype", flow.float32)
+            .Attr("shape", [2, 3])
+            .Build()()[0]
+        )
+        op = flow.builtin_op("expand_dims").Input("in").Output("out").Build()
+        y = op(x, axis=1)[0]
+        test_case.assertEqual(y.shape, flow.Size((2, 1, 3)))
+        y = op(x, axis=2)[0]
+        test_case.assertEqual(y.shape, flow.Size((2, 3, 1)))
+
+    def test_stateful_local_kernel(test_case):
+        op1 = (
+            flow.builtin_op("constant")
+            .Output("out")
+            .Attr("is_floating_value", True)
+            .Attr("floating_value", 3.0)
+            .Attr("dtype", flow.float32)
+            .Attr("shape", [1, 1])
+            .Build()
+        )
+        op2 = (
+            flow.builtin_op("matmul")
+            .Input("a")
+            .Input("b")
+            .Attr("transpose_a", False)
+            .Attr("transpose_b", False)
+            .Attr("alpha", float(1.0))
+            .Output("out")
+            .Build()
+        )
+        x = op1()[0]
+        x = op2(x, x)[0]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_summary.py b/python/oneflow/compatible/single_client/test/ops/test_summary.py
new file mode 100644
index 0000000000000000000000000000000000000000..5084ed12763da3bde2cc47800f0fabc841823485
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_summary.py
@@ -0,0 +1,182 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import tempfile
+import time
+import unittest
+from collections import OrderedDict
+
+import cv2
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def summary_demo():
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    with tempfile.TemporaryDirectory() as logdir:
+
+        @flow.global_function(function_config=func_config)
+        def CreateWriter():
+            flow.summary.create_summary_writer(logdir)
+
+        @flow.global_function(function_config=func_config)
+        def ScalarJob(
+            value: flow.typing.ListNumpy.Placeholder((1,), dtype=flow.float),
+            step: flow.typing.ListNumpy.Placeholder((1,), dtype=flow.int64),
+            tag: flow.typing.ListNumpy.Placeholder((1000,), dtype=flow.int8),
+        ):
+            flow.summary.scalar(value, step, tag)
+
+        @flow.global_function(function_config=func_config)
+        def HistogramJob(
+            value: flow.typing.ListNumpy.Placeholder((200, 200, 200), dtype=flow.float),
+            step: flow.typing.ListNumpy.Placeholder((1,), dtype=flow.int64),
+            tag: flow.typing.ListNumpy.Placeholder((9,), dtype=flow.int8),
+        ):
+            flow.summary.histogram(value, step, tag)
+
+        @flow.global_function(function_config=func_config)
+        def PbJob(
+            value: flow.typing.ListNumpy.Placeholder((1500,), dtype=flow.int8),
+            step: flow.typing.ListNumpy.Placeholder((1,), dtype=flow.int64),
+        ):
+            flow.summary.pb(value, step=step)
+
+        @flow.global_function(function_config=func_config)
+        def ImageJob(
+            value: flow.typing.ListNumpy.Placeholder(
+                shape=(100, 2000, 2000, 4), dtype=flow.uint8
+            ),
+            step: flow.typing.ListNumpy.Placeholder((1,), dtype=flow.int64),
+            tag: flow.typing.ListNumpy.Placeholder((10,), dtype=flow.int8),
+        ):
+            flow.summary.image(value, step=step, tag=tag)
+
+        @flow.global_function(function_config=func_config)
+        def FlushJob():
+            flow.summary.flush_summary_writer()
+
+        CreateWriter()
+        projecotr = flow.summary.Projector(logdir)
+        projecotr.create_embedding_projector()
+        projecotr.create_exception_projector()
+        hparams = {
+            flow.summary.HParam(
+                "learning_rate", flow.summary.RealRange(0.01, 0.1)
+            ): 0.02,
+            flow.summary.HParam("dense_layers", flow.summary.IntegerRange(2, 7)): 5,
+            flow.summary.HParam(
+                "optimizer", flow.summary.ValueSet(["adam", "sgd"])
+            ): "adam",
+            flow.summary.HParam("accuracy", flow.summary.RealRange(0.01, 0.1)): 0.001,
+            flow.summary.HParam("magic", flow.summary.ValueSet([False, True])): True,
+            flow.summary.Metric("loss", float): 0.02,
+            "dropout": 0.6,
+        }
+        for i in range(200):
+            t = ["vgg16", "resnet50", "mask-rcnn", "yolov3"]
+            pb = flow.summary.text(t)
+            value = np.fromstring(str(pb), dtype=np.int8)
+            step = np.array([i], dtype=np.int64)
+            PbJob([value], [step])
+            pb2 = flow.summary.hparams(hparams)
+            value = np.fromstring(str(pb2), dtype=np.int8)
+            step = np.array([i], dtype=np.int64)
+            PbJob([value], [step])
+        for idx in range(100):
+            value = np.array([idx], dtype=np.float32)
+            step = np.array([idx], dtype=np.int64)
+            tag = np.fromstring("scalar", dtype=np.int8)
+            ScalarJob([value], [step], [tag])
+        value = np.array(
+            [
+                [[1, 2, 3, 0], [0, 2, 3, 1], [2, 3, 4, 1]],
+                [[1, 0, 2, 0], [2, 1, 2, 0], [2, 1, 1, 1]],
+            ],
+            dtype=np.float64,
+        )
+        for idx in range(20):
+            value = np.random.rand(100, 100, 100).astype(np.float32)
+            step = np.array([idx], dtype=np.int64)
+            tag = np.fromstring("histogram", dtype=np.int8)
+            HistogramJob([value], [step], [tag])
+        value_ = np.random.rand(10, 10, 10).astype(np.float32)
+        label = (np.random.rand(10) * 10).astype(np.int64)
+        x = (np.random.rand(10, 10, 10) * 255).astype(np.uint8)
+        sample_name = "sample"
+        sample_type = "image"
+        step = 1
+        tag_exception = "exception_projector"
+        tag_embedding = "embedding_projector"
+        for i in range(20):
+            projecotr.exception_projector(
+                value=value,
+                tag=tag_exception,
+                step=step,
+                sample_name=sample_name,
+                sample_type=sample_type,
+                x=x,
+            )
+            projecotr.embedding_projector(
+                value=value,
+                label=label,
+                tag=tag_embedding,
+                step=step,
+                sample_name=sample_name,
+                sample_type=sample_type,
+                x=x,
+            )
+        images = [
+            cv2.cvtColor(np.ones([512, 512], np.uint8), cv2.COLOR_BGR2RGB).astype(
+                np.uint8
+            ),
+            cv2.cvtColor(np.ones([512, 512], np.uint8), cv2.COLOR_BGR2RGB).astype(
+                np.uint8
+            ),
+        ]
+        images = np.array(images, dtype=np.uint8)
+        imageRed = np.ones([512, 512, 3]).astype(np.uint8)
+        Red = np.array([0, 255, 255], dtype=np.uint8)
+        imageNew = np.multiply(imageRed, Red)
+        imageNew = np.expand_dims(imageNew, axis=0)
+        images = np.concatenate((images, imageNew), axis=0)
+        step = np.array([1], dtype=np.int64)
+        tag = np.fromstring("image", dtype=np.int8)
+        for i in range(20):
+            ImageJob([images], [step], [tag])
+        graph = flow.summary.Graph(logdir)
+        graph.write_structure_graph()
+        time.sleep(1)
+        FlushJob()
+        time.sleep(1)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSummary(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_ENABLE_EAGER"), "only test lazy cases")
+    def test_summary(test_case):
+        summary_demo()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_swish.py b/python/oneflow/compatible/single_client/test/ops/test_swish.py
new file mode 100644
index 0000000000000000000000000000000000000000..d918c0420c9b7a724cb27721afb667f130d4fd24
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_swish.py
@@ -0,0 +1,133 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_swish_with_np(input_shape, beta, device_type, machine_ids, device_counts):
+    input_1 = np.random.random(size=input_shape).astype(np.float32)
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+
+    def np_swish(input, beta):
+        def np_sigmoid(sigmoid_input):
+            return 1 / (1 + np.exp(-sigmoid_input))
+
+        return input * np_sigmoid(beta * input)
+
+    np_out_swish = np_swish(input_1, beta)
+
+    def np_diff(input, beta):
+        def np_sigmoid(sigmoid_input):
+            return 1 / (1 + np.exp(-sigmoid_input))
+
+        _fx = input * np_sigmoid(beta * input)
+        return beta * _fx + (1 - beta * _fx) * np_sigmoid(beta * input)
+
+    _np_grad = np_diff(input_1, beta)
+
+    def assert_prediction_grad(blob: tp.Numpy):
+        assert np.allclose(blob, _np_grad)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_swish(
+        of_input_1: tp.Numpy.Placeholder(shape=input_1.shape),
+    ) -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=input_1.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+            x_var = of_input_1 + v
+        flow.watch_diff(x_var, assert_prediction_grad)
+        of_swish_out = flow.nn.swish(x_var, beta)
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_swish_out)
+        return of_swish_out
+
+    of_out_swish = oneflow_swish(input_1)
+    assert np.allclose(of_out_swish, np_out_swish)
+
+
+def _gen_arg_dict(shape, beta, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["beta"] = [beta]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testswish1n1d(flow.unittest.TestCase):
+    def test_swish_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(4, 6), beta=1, device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_swish_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_swish_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32),
+            beta=10,
+            device_type="gpu",
+            machine_ids="0:0",
+            device_counts=1,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_swish_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Teststack1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_swish_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 8, 8, 4),
+            beta=2,
+            device_type="gpu",
+            machine_ids="0:0-1",
+            device_counts=2,
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_swish_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_sync_dynamic_resize.py b/python/oneflow/compatible/single_client/test/ops/test_sync_dynamic_resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e078138383f57574638a4d3626a406ce36d5713
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_sync_dynamic_resize.py
@@ -0,0 +1,66 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSyncDynamicResize(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_sync_dynamic_resize(_):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["x_shape"] = [(100,), (1000, 10)]
+        arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+        arg_dict["size_type"] = ["int32", "int64"]
+        for (device_type, x_shape, data_type, size_type) in GenArgList(arg_dict):
+            flow.clear_default_session()
+            func_config = flow.FunctionConfig()
+            func_config.default_data_type(flow.float)
+
+            @flow.global_function(function_config=func_config)
+            def TestJob(
+                x: oft.Numpy.Placeholder(
+                    x_shape, dtype=type_name_to_flow_type[data_type]
+                ),
+                size: oft.Numpy.Placeholder(
+                    (1,), dtype=type_name_to_flow_type[size_type]
+                ),
+            ):
+                with flow.scope.placement(device_type, "0:0"):
+                    return flow.sync_dynamic_resize(x, size)
+
+            size = np.random.randint(0, x_shape[0])
+            x = np.random.rand(*x_shape).astype(type_name_to_np_type[data_type])
+            y = (
+                TestJob(x, np.array([size]).astype(type_name_to_np_type[size_type]))
+                .get()
+                .numpy_list()[0]
+            )
+            assert np.array_equal(y, x[:size])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_tensor_buffer_ops.py b/python/oneflow/compatible/single_client/test/ops/test_tensor_buffer_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1f8f16c08412625fc43665520804da2e4ed42ee
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_tensor_buffer_ops.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_tensor_buffer_convert(test_case):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    input_arr = np.random.rand(16, 24, 32, 36).astype(np.float32)
+
+    @flow.global_function(function_config=func_config)
+    def job_fn(x: oft.Numpy.Placeholder(input_arr.shape, dtype=flow.float32)):
+        tensor_buffer = flow.tensor_to_tensor_buffer(x, instance_dims=2)
+        return flow.tensor_buffer_to_tensor(
+            tensor_buffer, dtype=flow.float32, instance_shape=[32, 36]
+        )
+
+    output_arr = job_fn(input_arr).get().numpy()
+    test_case.assertTrue(np.array_equal(input_arr, output_arr))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTensorBufferOps(flow.unittest.TestCase):
+    def test_tensor_buffer_convert(test_case):
+        _test_tensor_buffer_convert(test_case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_tensor_buffer_to_list_of_tensors.py b/python/oneflow/compatible/single_client/test/ops/test_tensor_buffer_to_list_of_tensors.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba94bc64a380bc82faa751e816aab4d9f8877c34
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_tensor_buffer_to_list_of_tensors.py
@@ -0,0 +1,68 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def _run_test(shape, shape_list, value_list, data_type):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def TestTensorBufferToListOfTensorsJob():
+        with flow.scope.placement("cpu", "0:0"):
+            x = flow.gen_tensor_buffer(
+                shape, shape_list, value_list, type_name_to_flow_type[data_type]
+            )
+            y = flow.tensor_buffer_to_list_of_tensors(
+                x, (100, 100), type_name_to_flow_type[data_type], True
+            )
+            return y
+
+    (out_0, out_1, out_2, out_3) = TestTensorBufferToListOfTensorsJob().get()
+    assert np.array_equal(out_0.numpy_list()[0], np.zeros((10, 10), np.float))
+    assert np.array_equal(out_1.numpy_list()[0], np.ones((50, 50), np.float))
+    assert np.array_equal(out_2.numpy_list()[0], np.ones((20, 80), np.float) * 2.0)
+    assert np.array_equal(out_3.numpy_list()[0], np.ones((100, 100), np.float) * 3.0)
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["shape"] = [(2, 2), (4,)]
+    arg_dict["shape_list"] = [[(10, 10), (50, 50), (20, 80), (100, 100)]]
+    arg_dict["value_list"] = [[0.0, 1.0, 2.0, 3.0]]
+    arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTensorBufferToListOfTensors(flow.unittest.TestCase):
+    def test_tensor_buffer_to_list_of_tensors(test_case):
+        for arg in gen_arg_list():
+            _run_test(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_top_k.py b/python/oneflow/compatible/single_client/test/ops/test_top_k.py
new file mode 100644
index 0000000000000000000000000000000000000000..6168b76bb95db248c44b54109179d12602a1481a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_top_k.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+from oneflow.compatible.single_client.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, in_shape, axis, k, data_type, sorted):
+    assert device_type in ["gpu", "cpu"]
+    assert data_type in ["float32", "double", "int8", "int32", "int64"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.mirrored_view())
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def TopKJob(
+        input: oft.ListNumpy.Placeholder(
+            tuple([dim + 10 for dim in in_shape]),
+            dtype=type_name_to_flow_type[data_type],
+        )
+    ):
+        with flow.scope.placement(device_type, "0:0"):
+            return flow.math.top_k(input, axis, k, sorted)
+
+    input = (np.random.random(in_shape) * 100).astype(type_name_to_np_type[data_type])
+    of_out = TopKJob([input]).get().numpy_list()[0]
+    if k <= in_shape[axis]:
+        perm = get_perm_when_transpose_axis_to_last_dim(len(in_shape), axis)
+        x = tf.transpose(input, perm)
+        (_, indices) = tf.math.top_k(x, k, sorted)
+        tf_out = tf.transpose(indices, get_inversed_perm(perm))
+    else:
+        tf_out = tf.argsort(input, axis, direction="DESCENDING", stable=True)
+    assert np.array_equal(of_out, tf_out.numpy())
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["in_shape"] = [(100,), (10, 10, 50)]
+    arg_dict["axis"] = [-1]
+    arg_dict["k"] = [1, 50]
+    arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+    arg_dict["sorted"] = [True]
+    return GenArgList(arg_dict)
+
+
+def gen_arg_list_for_test_axis():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cpu", "gpu"]
+    arg_dict["in_shape"] = [(10, 10, 500)]
+    arg_dict["axis"] = [-2, 0, 2]
+    arg_dict["k"] = [1, 50, 200]
+    arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
+    arg_dict["sorted"] = [True]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTopK(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_top_k(test_case):
+        for arg in gen_arg_list():
+            compare_with_tensorflow(*arg)
+        for arg in gen_arg_list_for_test_axis():
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_transpose.py b/python/oneflow/compatible/single_client/test/ops/test_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..02117e32289d49fe2caa7a1e4f230115958caf6e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_transpose.py
@@ -0,0 +1,109 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def compare_with_tensorflow(device_type, input_shape, perm):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def TransposeJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "input",
+                shape=input_shape,
+                dtype=flow.float,
+                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
+                trainable=True,
+            )
+            loss = flow.transpose(x, perm)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = TransposeJob().get()
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x"))
+        tf_out = tf.transpose(x, perm)
+    loss_diff = test_global_storage.Get("loss_diff")
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTranspose(flow.unittest.TestCase):
+    def test_transpose(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(10, 11, 12, 13)]
+        arg_dict["perm"] = [(2, 0, 1, 3), (1, 0, 2, 3), (3, 2, 1, 0), (3, 1, 2, 0)]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_transpose2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(10, 11, 12)]
+        arg_dict["perm"] = [(2, 0, 1), (1, 0, 2), (2, 1, 0), (1, 2, 0)]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_transpose3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(10, 11)]
+        arg_dict["perm"] = [(1, 0), (0, 1)]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_transpose_dim6(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(2, 3, 4, 5, 6, 7)]
+        arg_dict["perm"] = [(2, 0, 1, 3, 5, 4)]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_tril.py b/python/oneflow/compatible/single_client/test/ops/test_tril.py
new file mode 100644
index 0000000000000000000000000000000000000000..52353c0e145b046c6fec30b40980e10bf7996614
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_tril.py
@@ -0,0 +1,117 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    GenArgDict,
+    test_global_storage,
+    type_name_to_flow_type,
+    type_name_to_np_type,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_tril_fw_bw(test_case, device, shape, type_name, diagonal, fill_value):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if type_name == "float16":
+        flow_type = flow.float
+        np_type = np.float32
+    else:
+        flow_type = type_name_to_flow_type[type_name]
+        np_type = type_name_to_np_type[type_name]
+
+    @flow.global_function(type="train", function_config=func_config)
+    def test_tril_fw_bw_job(x: oft.Numpy.Placeholder(shape, dtype=flow_type)):
+        with flow.scope.placement(device, "0:0"):
+            x_var = flow.get_variable(
+                name="xv",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            x += flow.cast(x_var, dtype=flow_type)
+            if type_name == "float16":
+                out = flow.cast(
+                    flow.math.tril(flow.cast(x, flow.float16), diagonal), flow.float
+                )
+            else:
+                out = flow.math.tril(x, diagonal)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(out)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(out, test_global_storage.Setter("out"))
+            flow.watch_diff(out, test_global_storage.Setter("out_diff"))
+            return out
+
+    x = np.random.randint(low=0, high=100, size=shape)
+    test_tril_fw_bw_job(x.astype(np_type)).get()
+    np_out = np.where(
+        np.tril(np.ones(shape), diagonal),
+        test_global_storage.Get("x"),
+        np.full(shape, fill_value).astype(np_type),
+    )
+    np_x_diff = np.tril(test_global_storage.Get("out_diff"), diagonal)
+    if type_name == "float16":
+        tolerance = 0.001
+    else:
+        tolerance = 1e-05
+    test_case.assertTrue(
+        np.allclose(
+            np_out, test_global_storage.Get("out"), rtol=tolerance, atol=tolerance
+        )
+    )
+    test_case.assertTrue(
+        np.allclose(
+            np_x_diff, test_global_storage.Get("x_diff"), rtol=tolerance, atol=tolerance
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTril(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_tril_fw_bw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "gpu"]
+        arg_dict["type_name"] = ["float32", "float16", "double", "int32", "int64"]
+        arg_dict["shape"] = [(3, 6, 8)]
+        arg_dict["diagonal"] = [-8, -1, 0, 8]
+        arg_dict["fill_value"] = [1.0, 0]
+        for arg in GenArgDict(arg_dict):
+            if arg["device"] == "cpu" and arg["type_name"] == "float16":
+                continue
+            if isinstance(arg["fill_value"], float) and arg_dict["type_name"] not in [
+                "float32",
+                "float16",
+                "double",
+            ]:
+                continue
+            _test_tril_fw_bw(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_two_node_boxing.py b/python/oneflow/compatible/single_client/test/ops/test_two_node_boxing.py
new file mode 100644
index 0000000000000000000000000000000000000000..8627d6e2ae5a8fb70b98b0f8250043afc4816552
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_two_node_boxing.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import time
+import unittest
+from typing import Tuple
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_2n1d()
+class TestTwoNodeBoxing(flow.unittest.TestCase):
+    def test_two_node_boardcast(test_case):
+        flow.clear_default_session()
+        flow.config.enable_debug_mode(True)
+        flow.config.gpu_device_num(4)
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def split_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96))):
+            with flow.scope.placement("gpu", "0:0"):
+                src = flow.identity(
+                    input_blob.with_distribute(flow.distribute.split(0))
+                )
+            with flow.scope.placement("gpu", ["0:0", "1:0"]):
+                dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
+            return dst
+
+        x = np.random.rand(96, 96).astype(np.float32)
+        result = split_to_broadcast_job(x).get()
+        test_case.assertTrue(np.array_equal(x, result.numpy()))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_two_stage_reduce.py b/python/oneflow/compatible/single_client/test/ops/test_two_stage_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb9fa69ab1bcb91c43173f8c860994862434082c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_two_stage_reduce.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _compare_with_numpy(test_case, np_func, x, y, axis, keepdims=True):
+    x = test_global_storage.Get("x")
+    dx = test_global_storage.Get("x_diff")
+    np_y = np_func(x, axis=axis, keepdims=True)
+    test_case.assertTrue(np.allclose(y, np_y, rtol=1e-05, atol=1e-05))
+    mask = np.where(x == y, 1, 0)
+    count = np.add.reduce(mask, axis=axis, keepdims=True)
+    np_dx = np.where(x == y, 1 / count, 0)
+    test_case.assertTrue(np.allclose(dx, np_dx, rtol=1e-05, atol=1e-05))
+
+
+def _test_two_stage_reduce(
+    test_case, flow_func, np_func, device_type, axis, split_axis
+):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    @flow.global_function(type="train", function_config=func_config)
+    def two_stage_reduce_job(x: oft.Numpy.Placeholder((4, 20, 20, 20))):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+        with flow.scope.placement(device_type, "0:0-3"):
+            loss = flow_func(
+                x.with_distribute(flow.distribute.split(split_axis)),
+                axis=axis,
+                keepdims=True,
+            )
+            loss = flow.identity(loss)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            return loss
+
+    x = np.random.randint(low=0, high=10, size=(4, 20, 20, 20)).astype(np.float32)
+    y = two_stage_reduce_job(x).get().numpy()
+    _compare_with_numpy(test_case, np_func, x, y, axis=tuple(axis))
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestTwoStageReduce(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_two_stage_reduce_max(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["flow_func"] = [flow.math.two_stage_reduce_max]
+        arg_dict["np_func"] = [np.maximum.reduce]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["axis"] = [[1], [1, 2], [1, 2, 3]]
+        arg_dict["split_axis"] = [1]
+        for arg in GenArgList(arg_dict):
+            _test_two_stage_reduce(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_two_stage_reduce_min(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["flow_func"] = [flow.math.two_stage_reduce_min]
+        arg_dict["np_func"] = [np.minimum.reduce]
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["axis"] = [[1], [1, 2], [1, 2, 3]]
+        arg_dict["split_axis"] = [1]
+        for arg in GenArgList(arg_dict):
+            _test_two_stage_reduce(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_unary_elementwise_ops.py b/python/oneflow/compatible/single_client/test/ops/test_unary_elementwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..071aa2018b625e12244b0d29a054e918f15e219b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_unary_elementwise_ops.py
@@ -0,0 +1,620 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+from scipy.special import erf, erfc, gammaln
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestUnaryElementwiseOps(flow.unittest.TestCase):
+    def test_abs(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AbsJob(a: oft.Numpy.Placeholder((5, 2))):
+            return flow.math.abs(a)
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = AbsJob(x).get().numpy()
+        test_case.assertTrue(np.array_equal(y, np.absolute(x)))
+
+    def test_acos(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AcosJob(a: oft.Numpy.Placeholder((5, 2))):
+            return flow.math.acos(a)
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = AcosJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arccos(x)))
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_acos_consistent_1n2c(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AcosJob(a: oft.Numpy.Placeholder((5, 2))):
+            return flow.math.acos(a)
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = AcosJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arccos(x)))
+
+    def test_acos_cpu(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_placement_scope(flow.scope.placement("cpu", "0:0"))
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AcosJob(a: oft.Numpy.Placeholder((5, 2))):
+            return flow.math.acos(a)
+
+        x = np.random.rand(5, 2).astype(np.float32)
+        y = AcosJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arccos(x)))
+
+    def test_acos_double(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AcosJob(a: oft.Numpy.Placeholder((5, 2), dtype=flow.double)):
+            return flow.math.acos(a)
+
+        x = np.random.rand(5, 2).astype(np.double)
+        y = AcosJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arccos(x)))
+
+    def test_acosh(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AcoshJob(a: oft.Numpy.Placeholder((7,))):
+            return flow.math.acosh(a)
+
+        x = np.array([-2, -0.5, 1, 1.2, 200, 10000, float("inf")], dtype=np.float32)
+        y = AcoshJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arccosh(x), equal_nan=True))
+        x = np.random.uniform(low=1.0, high=100.0, size=(7,)).astype(np.float32)
+        y = AcoshJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arccosh(x), equal_nan=True))
+
+    def test_asin(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AsinJob(a: oft.Numpy.Placeholder((2,))):
+            return flow.math.asin(a)
+
+        x = np.array([0.8659266, 0.7068252], dtype=np.float32)
+        y = AsinJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arcsin(x), equal_nan=True))
+        x = np.random.uniform(low=-1.0, high=1.0, size=(2,)).astype(np.float32)
+        y = AsinJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arcsin(x), equal_nan=True))
+
+    def test_asinh(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AsinhJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.asinh(a)
+
+        x = np.array(
+            [-float("inf"), -2, -0.5, 1, 1.2, 200, 10000, float("inf")],
+            dtype=np.float32,
+        )
+        y = AsinhJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arcsinh(x), equal_nan=True))
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = AsinhJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arcsinh(x), equal_nan=True))
+
+    def test_atan(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AtanJob(a: oft.Numpy.Placeholder((2,))):
+            return flow.math.atan(a)
+
+        x = np.array([1.731261, 0.99920404], dtype=np.float32)
+        y = AtanJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arctan(x), equal_nan=True))
+        pi = 3.14159265357
+        x = np.random.uniform(low=-pi / 2, high=pi / 2, size=(2,)).astype(np.float32)
+        y = AtanJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arctan(x), equal_nan=True))
+
+    def test_atanh(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def AtanhJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.atanh(a)
+
+        x = np.array(
+            [-float("inf"), -1, -0.5, 1, 0, 0.5, 10, float("inf")], dtype=np.float32
+        )
+        y = AtanhJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arctanh(x), equal_nan=True))
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = AtanhJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.arctanh(x), equal_nan=True))
+
+    def test_ceil(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def CeilJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.ceil(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = CeilJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.ceil(x), equal_nan=True))
+
+    def test_cos(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def CosJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.cos(a)
+
+        x = np.array(
+            [-float("inf"), -9, -0.5, 1, 1.2, 200, 10000, float("inf")],
+            dtype=np.float32,
+        )
+        y = CosJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.cos(x), equal_nan=True))
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = CosJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.cos(x), equal_nan=True))
+
+    def test_cosh(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def CoshJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.cosh(a)
+
+        x = np.array(
+            [-float("inf"), -9, -0.5, 1, 1.2, 2, 10, float("inf")], dtype=np.float32
+        )
+        y = CoshJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.cosh(x), equal_nan=True))
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = CoshJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.cosh(x), equal_nan=True))
+
+    def test_erf(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def ErfJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.erf(a)
+
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = ErfJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, erf(x), equal_nan=True))
+
+    def test_erfc(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def ErfcJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.erfc(a)
+
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = ErfcJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, erfc(x), equal_nan=True))
+
+    def test_exp(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def ExpJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.exp(a)
+
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = ExpJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.exp(x), equal_nan=True))
+
+    def test_expm1(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Expm1Job(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.expm1(a)
+
+        x = np.random.uniform(size=(8,)).astype(np.float32)
+        y = Expm1Job(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.expm1(x), equal_nan=True))
+
+    def test_floor(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def FloorJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.floor(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = FloorJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.floor(x), equal_nan=True))
+
+    def test_lgamma(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def LgammaJob(a: oft.Numpy.Placeholder((6,))):
+            return flow.math.lgamma(a)
+
+        x = np.array([0, 0.5, 1, 4.5, -4, -5.6], dtype=np.float32)
+        y = LgammaJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, gammaln(x), equal_nan=True))
+
+    def test_log(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def LogJob(a: oft.Numpy.Placeholder((4,))):
+            return flow.math.log(a)
+
+        x = np.array([0, 0.5, 1, 5], dtype=np.float32)
+        y = LogJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.log(x), equal_nan=True))
+
+    def test_log1p(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def Log1pJob(a: oft.Numpy.Placeholder((4,))):
+            return flow.math.log1p(a)
+
+        x = np.array([0, 0.5, 1, 5], dtype=np.float32)
+        y = Log1pJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.log1p(x), equal_nan=True))
+
+    def test_log_sigmoid(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def LogSigmoidJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.log_sigmoid(a)
+
+        x = np.random.uniform(low=-5.0, high=5.0, size=(8,)).astype(np.float32)
+        y = LogSigmoidJob(x).get().numpy()
+        test_case.assertTrue(
+            np.allclose(
+                y, -np.log(1 + np.exp(-x)), equal_nan=True, rtol=0.001, atol=1e-05
+            )
+        )
+
+    def test_negative(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def NegativeJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.negative(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = NegativeJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, -x, equal_nan=True))
+
+    def test_reciprocal(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def ReciprocalJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.reciprocal(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = ReciprocalJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, 1.0 / x, equal_nan=True))
+
+    def test_reciprocal_no_nan(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def ReciprocalNoNanJob(a: oft.Numpy.Placeholder((4,))):
+            return flow.math.reciprocal_no_nan(a)
+
+        x = np.array([2.0, 0.5, 0, 1], dtype=np.float32)
+        out = np.array([0.5, 2, 0.0, 1.0], dtype=np.float32)
+        y = ReciprocalNoNanJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, out, equal_nan=True))
+
+    def test_rint(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def RintJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.rint(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = RintJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.rint(x), equal_nan=True))
+
+    def test_rint_special_value(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def RintJob(a: oft.Numpy.Placeholder((9,))):
+            return flow.math.rint(a)
+
+        x = np.array(
+            [0.5000001, -1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.5, 3.5], dtype=np.float32
+        )
+        out = np.array(
+            [1.0, -2.0, -2.0, -0.0, 0.0, 2.0, 2.0, 2.0, 4.0], dtype=np.float32
+        )
+        y = RintJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, out, equal_nan=True))
+
+    def test_round(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def RoundJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.round(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = RoundJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.round(x), equal_nan=True))
+
+    def test_round_special_value(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def RoundJob(a: oft.Numpy.Placeholder((5,))):
+            return flow.math.round(a)
+
+        x = np.array([0.9, 2.5, 2.3, 1.5, -4.5], dtype=np.float32)
+        out = np.array([1.0, 2.0, 2.0, 2.0, -4.0], dtype=np.float32)
+        y = RoundJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, out, equal_nan=True))
+
+    def test_rsqrt(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def RsqrtJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.rsqrt(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = RsqrtJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, 1 / np.sqrt(x), equal_nan=True))
+
+    def test_sigmoid_v2(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SigmoidJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.sigmoid_v2(a)
+
+        x = np.random.uniform(low=-2.0, high=2.0, size=(8,)).astype(np.float32)
+        y = SigmoidJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, 1.0 / (1.0 + np.exp(-x)), equal_nan=True))
+
+    def test_sign(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SignJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.sign(a)
+
+        x = np.random.uniform(low=-100.0, high=100.0, size=(8,)).astype(np.float32)
+        y = SignJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.sign(x), equal_nan=True))
+
+    def test_sign_double(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SignJob(a: oft.Numpy.Placeholder((8,), dtype=flow.double)):
+            return flow.math.sign(a)
+
+        x = np.random.uniform(low=-100.0, high=100.0, size=(8,)).astype(np.double)
+        y = SignJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.sign(x), equal_nan=True))
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_sign_double_consistent_1n2c(test_case):
+        flow.config.gpu_device_num(2)
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SignJob(a: oft.Numpy.Placeholder((8,), dtype=flow.double)):
+            return flow.math.sign(a)
+
+        x = np.random.uniform(low=-100.0, high=100.0, size=(8,)).astype(np.double)
+        y = SignJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.sign(x), equal_nan=True))
+
+    def test_sin(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SinJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.sin(a)
+
+        x = np.array(
+            [-float("inf"), -9, -0.5, 1, 1.2, 200, 10, float("inf")], dtype=np.float32
+        )
+        y = SinJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.sin(x), equal_nan=True))
+        x = np.random.uniform(low=-100.0, high=100.0, size=(8,)).astype(np.float32)
+        y = SinJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.sin(x), equal_nan=True))
+
+    def test_softplus(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SoftplusJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.softplus(a)
+
+        x = np.random.uniform(low=-10.0, high=10.0, size=(8,)).astype(np.float32)
+        y = SoftplusJob(x).get().numpy()
+        test_case.assertTrue(
+            np.allclose(
+                y, np.log(np.exp(x) + 1), equal_nan=True, rtol=0.001, atol=1e-05
+            )
+        )
+
+    def test_sqrt(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SqrtJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.sqrt(a)
+
+        x = np.random.uniform(low=0.0, high=100.0, size=(8,)).astype(np.float32)
+        y = SqrtJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.sqrt(x), equal_nan=True))
+
+    def test_square(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def SquareJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.square(a)
+
+        x = np.random.uniform(low=-100.0, high=100.0, size=(8,)).astype(np.float32)
+        y = SquareJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, x * x, equal_nan=True))
+
+    def test_tan(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def TanJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.tan(a)
+
+        x = np.array(
+            [-float("inf"), -9, -0.5, 1, 1.2, 200, 10000, float("inf")],
+            dtype=np.float32,
+        )
+        y = TanJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.tan(x), equal_nan=True))
+        x = np.random.uniform(low=-100.0, high=100.0, size=(8,)).astype(np.float32)
+        y = TanJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.tan(x), equal_nan=True))
+
+    def test_tanh(test_case):
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def TanhJob(a: oft.Numpy.Placeholder((8,))):
+            return flow.math.tanh(a)
+
+        x = np.array(
+            [-float("inf"), -5, -0.5, 1, 1.2, 2, 3, float("inf")], dtype=np.float32
+        )
+        y = TanhJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.tanh(x), equal_nan=True))
+        x = np.random.uniform(low=-100.0, high=100.0, size=(8,)).astype(np.float32)
+        y = TanhJob(x).get().numpy()
+        test_case.assertTrue(np.allclose(y, np.tanh(x), equal_nan=True))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_unique.py b/python/oneflow/compatible/single_client/test/ops/test_unique.py
new file mode 100644
index 0000000000000000000000000000000000000000..61087ceedebb4b080bd3ccd0b63c688b36bd2756
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_unique.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+
+
+def _check_unique(test_case, x, y, idx, count, num_unique):
+    (ref_y, ref_count) = np.unique(x, return_counts=True)
+    sorted_idx = np.argsort(ref_y)
+    ref_y = ref_y[sorted_idx]
+    ref_count = ref_count[sorted_idx]
+    num_unique = num_unique.item()
+    test_case.assertTrue(num_unique, np.size(ref_y))
+    y = y[0:num_unique]
+    test_case.assertTrue(np.array_equal(y[idx], x))
+    sorted_idx = np.argsort(y)
+    test_case.assertTrue(np.array_equal(ref_y, y[sorted_idx]))
+    count = count[0:num_unique]
+    test_case.assertTrue(np.array_equal(count[sorted_idx], ref_count))
+
+
+def _run_test(test_case, x, dtype, device):
+    @flow.global_function(function_config=func_config)
+    def UniqueWithCountsJob(x: oft.Numpy.Placeholder(x.shape, dtype=dtype)):
+        with flow.scope.placement(device, "0:0"):
+            return flow.experimental.unique_with_counts(x)
+
+    (y, idx, count, num_unique) = UniqueWithCountsJob(x).get()
+    _check_unique(
+        test_case, x, y.numpy(), idx.numpy(), count.numpy(), num_unique.numpy()
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUnique(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_unique_with_counts_int(test_case):
+        x = np.asarray(list(range(32)) * 2).astype(np.int32)
+        np.random.shuffle(x)
+        _run_test(test_case, x, flow.int32, "gpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_unique_with_counts_float(test_case):
+        x = np.asarray(list(range(32)) * 2).astype(np.float32)
+        np.random.shuffle(x)
+        _run_test(test_case, x, flow.float32, "gpu")
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_unique_with_counts_random_gpu(test_case):
+        x = np.random.randint(0, 32, 1024).astype(np.int32)
+        np.random.shuffle(x)
+        _run_test(test_case, x, flow.int32, "gpu")
+
+    def test_unique_with_counts_random_cpu(test_case):
+        x = np.random.randint(0, 32, 1024).astype(np.int32)
+        np.random.shuffle(x)
+        _run_test(test_case, x, flow.int32, "cpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_unpack_pack.py b/python/oneflow/compatible/single_client/test/ops/test_unpack_pack.py
new file mode 100644
index 0000000000000000000000000000000000000000..831486f36b8715b0380a9053cfd71b26f7013a38
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_unpack_pack.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_logical_view(flow.scope.mirrored_view())
+func_config.default_data_type(flow.float)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUnpackPack(flow.unittest.TestCase):
+    def test_unpack_pack(test_case):
+        if flow.eager_execution_enabled():
+            return
+
+        @flow.global_function(function_config=func_config)
+        def UnpackPackJob(a: oft.Numpy.Placeholder((3, 4))):
+            return flow.pack(flow.unpack(a, 3), 3)
+
+        x = np.random.rand(3, 4).astype(np.float32)
+        y = UnpackPackJob(x).get().numpy()
+        test_case.assertTrue(np.array_equal(y, x))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_unsorted_batch_segment_sum.py b/python/oneflow/compatible/single_client/test/ops/test_unsorted_batch_segment_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ab2d7e3eebde05d0d1dfe454c9c7314142dc928
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_unsorted_batch_segment_sum.py
@@ -0,0 +1,124 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+func_config.default_logical_view(flow.scope.consistent_view())
+
+
+def _check(test_case, data, segment_ids, out_shape, out):
+    test_case.assertEqual(out.shape, out_shape)
+    ref = np.zeros_like(out)
+    for (idx, i) in np.ndenumerate(segment_ids):
+        out_idx = list(idx)
+        out_idx[-1] = i
+        out_idx = tuple(out_idx)
+        ref[out_idx] += data[idx]
+    test_case.assertTrue(np.allclose(ref, out, atol=1e-05, rtol=1e-05))
+
+
+def _check_bw(test_case, params, indices, out_shape, out):
+    ref = np.zeros_like(out)
+    for (idx, i) in np.ndenumerate(indices):
+        in_idx = list(idx)
+        in_idx[-1] = i
+        in_idx = tuple(in_idx)
+        ref[idx] += params[in_idx]
+    test_case.assertTrue(np.array_equal(ref, out))
+
+
+def _gen_segment_ids(out_shape, num_segments, segment_ids_shape):
+    axis = len(segment_ids_shape) - 1
+    return np.random.randint(
+        low=0, high=out_shape[axis], size=segment_ids_shape, dtype=np.int32
+    )
+
+
+def _gen_data(out_shape, num_segments, segment_ids_shape):
+    axis = len(segment_ids_shape) - 1
+    data_shape = out_shape[0:axis] + (segment_ids_shape[axis],) + out_shape[axis + 1 :]
+    return np.random.rand(*data_shape).astype(np.float32)
+
+
+def _make_unsoted_segment_sum_fn(device, data, segment_ids, num_segments):
+    flow.clear_default_session()
+
+    @flow.global_function(type="train", function_config=func_config)
+    def unsorted_batch_segment_sum_job(
+        data: oft.Numpy.Placeholder(data.shape, dtype=flow.float),
+        segment_ids: oft.Numpy.Placeholder(segment_ids.shape, dtype=flow.int32),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            x = flow.get_variable(
+                "data",
+                shape=data.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            data = x + data
+            res = flow.math.unsorted_batch_segment_sum(
+                data=data, segment_ids=segment_ids, num_segments=num_segments
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(res)
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch_diff(res, test_global_storage.Setter("loss_diff"))
+            return res
+
+    return unsorted_batch_segment_sum_job(data, segment_ids)
+
+
+def _run_test(test_case, device, out_shape, num_segments, segment_ids_shape):
+    segment_ids = _gen_segment_ids(out_shape, num_segments, segment_ids_shape)
+    data = _gen_data(out_shape, num_segments, segment_ids_shape)
+    unsorted_batch_segment_sum_out = _make_unsoted_segment_sum_fn(
+        device, data, segment_ids, num_segments
+    ).get()
+    out_ndarray = unsorted_batch_segment_sum_out.numpy()
+    grad_in_ndarray = test_global_storage.Get("x_diff")
+    grad_out_ndarray = test_global_storage.Get("loss_diff")
+    _check(test_case, data, segment_ids, out_shape, out_ndarray)
+    _check_bw(
+        test_case, grad_out_ndarray, segment_ids, grad_in_ndarray.shape, grad_in_ndarray
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUnsortedBatchSegmentSum(flow.unittest.TestCase):
+    def test_unsorted_batch_segment_sum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["out_shape"] = [(2, 4, 7, 6)]
+        arg_dict["num_segments"] = [7]
+        arg_dict["segment_ids_shape"] = [(2, 4, 5)]
+        for arg in GenArgList(arg_dict):
+            _run_test(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum.py b/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..86bfc0dbef0853049225343af70c97ec1c4ad85c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum.py
@@ -0,0 +1,156 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+func_config = flow.FunctionConfig()
+func_config.default_data_type(flow.float)
+func_config.default_logical_view(flow.scope.consistent_view())
+
+
+def _check(test_case, data_type, data, segment_ids, out_shape, axis, out):
+    test_case.assertEqual(out.shape, out_shape)
+    ref = np.zeros_like(out)
+    if axis != 0:
+        ref_perm = [axis] + list(range(0, axis)) + list(range(axis + 1, ref.ndim))
+        ref = np.transpose(ref, ref_perm)
+        data_perm = (
+            list(range(axis, axis + segment_ids.ndim))
+            + list(range(0, axis))
+            + list(range(axis + segment_ids.ndim, data.ndim))
+        )
+        data = np.transpose(data, data_perm)
+    for (idx, i) in np.ndenumerate(segment_ids):
+        ref[i] += data[idx]
+    if axis != 0:
+        ref_perm = list(range(1, axis + 1)) + [0] + list(range(axis + 1, ref.ndim))
+        ref = np.transpose(ref, ref_perm)
+    if data_type == "float16":
+        ref = ref.astype(np.float16).astype(np.float32)
+    test_case.assertTrue(np.allclose(ref, out, rtol=0.001, atol=0.001))
+
+
+def _gen_segment_ids(out_shape, axis, segment_ids_shape):
+    return np.random.randint(0, out_shape[axis], tuple(segment_ids_shape)).astype(
+        np.int32
+    )
+
+
+def _gen_data(data_type, out_shape, axis, segment_ids_shape):
+    data_shape = out_shape[0:axis] + segment_ids_shape + out_shape[axis + 1 :]
+    if data_type == "float16":
+        return (
+            np.random.uniform(-1.0, 1.0, size=data_shape)
+            .astype(np.float16)
+            .astype(np.float32)
+        )
+    else:
+        return np.random.uniform(-1.0, 1.0, size=data_shape).astype(
+            type_name_to_np_type[data_type]
+        )
+
+
+def _run_test(test_case, device, data_type, out_shape, axis, segment_ids_shape):
+    flow.clear_default_session()
+    if data_type == "float16":
+        dtype = flow.float
+        np_type = np.float32
+    else:
+        dtype = type_name_to_flow_type[data_type]
+        np_type = type_name_to_np_type[data_type]
+    segment_ids = _gen_segment_ids(out_shape, axis, segment_ids_shape)
+    data = _gen_data(data_type, out_shape, axis, segment_ids_shape)
+
+    @flow.global_function(function_config=func_config)
+    def unsorted_segment_sum_job(
+        data: oft.Numpy.Placeholder(data.shape, dtype=dtype),
+        segment_ids: oft.Numpy.Placeholder(segment_ids.shape, dtype=flow.int32),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            if data_type == "float16":
+                data = flow.cast(data, dtype=flow.float16)
+                return flow.cast(
+                    flow.math.unsorted_segment_sum(
+                        data=data,
+                        segment_ids=segment_ids,
+                        num_segments=out_shape[axis],
+                        axis=axis,
+                    ),
+                    dtype=dtype,
+                )
+            else:
+                return flow.math.unsorted_segment_sum(
+                    data=data,
+                    segment_ids=segment_ids,
+                    num_segments=out_shape[axis],
+                    axis=axis,
+                )
+
+    @flow.global_function(function_config=func_config)
+    def unsorted_segment_sum_like_job(
+        data: oft.Numpy.Placeholder(data.shape, dtype=dtype),
+        segment_ids: oft.Numpy.Placeholder(segment_ids.shape, dtype=flow.int32),
+        like: oft.Numpy.Placeholder(out_shape, dtype=dtype),
+    ):
+        with flow.scope.placement(device, "0:0"):
+            if data_type == "float16":
+                data = flow.cast(data, dtype=flow.float16)
+                like = flow.cast(like, dtype=flow.float16)
+                return flow.cast(
+                    flow.math.unsorted_segment_sum_like(
+                        data=data, segment_ids=segment_ids, like=like, axis=axis
+                    ),
+                    dtype=dtype,
+                )
+            else:
+                return flow.math.unsorted_segment_sum_like(
+                    data=data, segment_ids=segment_ids, like=like, axis=axis
+                )
+
+    out = unsorted_segment_sum_job(data, segment_ids).get()
+    _check(test_case, data_type, data, segment_ids, out_shape, axis, out.numpy())
+    like = np.zeros(out_shape, dtype=np_type)
+    out = unsorted_segment_sum_like_job(data, segment_ids, like).get()
+    _check(test_case, data_type, data, segment_ids, out_shape, axis, out.numpy())
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUnsortedSegmentSum(flow.unittest.TestCase):
+    def test_unsorted_segment_sum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["data_type"] = ["float32", "float16"]
+        arg_dict["out_shape"] = [(4,), (4, 5), (4, 5, 6), (4, 5, 6, 7)]
+        arg_dict["axis"] = [0, 1, 2, 3]
+        arg_dict["segment_ids_shape"] = [(64,), (64, 96)]
+        for arg in GenArgList(arg_dict):
+            if arg[0] == "cpu" and arg[1] == "float16":
+                continue
+            if arg[3] >= len(arg[2]):
+                continue
+            _run_test(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum_fw_bw.py b/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum_fw_bw.py
new file mode 100644
index 0000000000000000000000000000000000000000..57175b3b30e59cdbc4729a7d12578ec7812f39c0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum_fw_bw.py
@@ -0,0 +1,160 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _random_inputs(data_shape, segment_ids_shape, axis, num_segments):
+    data = np.random.rand(*data_shape).astype(np.float32)
+    segment_ids = np.random.randint(
+        low=0, high=num_segments, size=segment_ids_shape, dtype=np.int32
+    )
+    return (data, segment_ids)
+
+
+def _make_unsorted_segment_sum_fn(
+    data, segment_ids, axis, num_segments, device_type, mirrored, compare_fn
+):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if mirrored:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+    else:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+    def do_unsorted_segment_sum(x_blob, i_blob):
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "data",
+                shape=data.shape,
+                dtype=flow.float32,
+                initializer=flow.constant_initializer(0),
+            )
+            x = x + x_blob
+            y = flow.math.unsorted_segment_sum(
+                x, i_blob, axis=axis, num_segments=num_segments
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(y)
+        flow.watch_diff(x, compare_fn)
+        return y
+
+    if mirrored:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def unsorted_segment_sum_fn(
+            data_def: oft.ListNumpy.Placeholder(data.shape, dtype=flow.float),
+            segment_ids_def: oft.ListNumpy.Placeholder(
+                segment_ids.shape, dtype=flow.int32
+            ),
+        ):
+            return do_unsorted_segment_sum(data_def, segment_ids_def)
+
+    else:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def unsorted_segment_sum_fn(
+            data_def: oft.Numpy.Placeholder(data.shape, dtype=flow.float),
+            segment_ids_def: oft.Numpy.Placeholder(segment_ids.shape, dtype=flow.int32),
+        ):
+            return do_unsorted_segment_sum(data_def, segment_ids_def)
+
+    return unsorted_segment_sum_fn
+
+
+def _compare_unsorted_segment_sum_with_tf(
+    test_case,
+    device_type,
+    data_shape,
+    segment_ids_shape,
+    axis,
+    num_segments,
+    mirrored=False,
+):
+    (data, segment_ids) = _random_inputs(
+        data_shape, segment_ids_shape, axis, num_segments
+    )
+    i = tf.constant(segment_ids)
+    with tf.GradientTape() as t:
+        x = tf.Variable(data)
+        y = tf.math.unsorted_segment_sum(x, i, num_segments=num_segments)
+    dy = t.gradient(y, x)
+    if isinstance(dy, tf.IndexedSlices):
+        test_case.assertTrue(
+            np.array_equal(segment_ids.ravel(), dy.segment_ids.numpy().ravel())
+        )
+        zero_data = tf.Variable(np.full(data.shape, 0.0, dtype=np.float32))
+        dy = tf.math.tensor_scatter_nd_add(zero_data, i, dy.values)
+    if mirrored:
+
+        def compare_dy(data_grad):
+            test_case.assertTrue(np.array_equal(dy.numpy(), data_grad.numpy_list()[0]))
+
+    else:
+
+        def compare_dy(data_grad):
+            test_case.assertTrue(np.array_equal(dy.numpy(), data_grad.numpy()))
+
+    unsorted_segment_sum_fn = _make_unsorted_segment_sum_fn(
+        data, segment_ids, axis, num_segments, device_type, mirrored, compare_dy
+    )
+    if mirrored:
+        of_y = unsorted_segment_sum_fn([data], [segment_ids]).get().numpy_list()[0]
+    else:
+        of_y = unsorted_segment_sum_fn(data, segment_ids).get().numpy()
+    test_case.assertTrue(np.allclose(y.numpy(), of_y, rtol=1e-05, atol=1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUnsortedSegmentSumFwBw(flow.unittest.TestCase):
+    def test_unsorted_segment_sum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["data_shape"] = [(300, 1, 4)]
+        arg_dict["segment_ids_shape"] = [300]
+        arg_dict["axis"] = [0]
+        arg_dict["num_segments"] = [2]
+        for arg in GenArgList(arg_dict):
+            _compare_unsorted_segment_sum_with_tf(test_case, *arg)
+
+    def test_unsorted_segment_sum_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["data_shape"] = [(200, 10, 20)]
+        arg_dict["segment_ids_shape"] = [200]
+        arg_dict["axis"] = [0]
+        arg_dict["num_segments"] = [5]
+        for arg in GenArgList(arg_dict):
+            _compare_unsorted_segment_sum_with_tf(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum_model_parallel.py b/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum_model_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..4716c442504cc26cf24b81bc2eeade77077812e6
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_unsorted_segment_sum_model_parallel.py
@@ -0,0 +1,118 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _gen_test_data(out_shape, segment_ids_shape, axis):
+    data_shape = out_shape[:axis] + segment_ids_shape + out_shape[axis + 1 :]
+    data = np.random.rand(*data_shape).astype(np.float32)
+    segment_ids = np.random.randint(
+        low=0, high=out_shape[axis], size=segment_ids_shape
+    ).astype(np.int32)
+    out = np.zeros(shape=out_shape, dtype=np.float32)
+    if axis != 0:
+        ref_perm = [axis] + list(range(0, axis)) + list(range(axis + 1, out.ndim))
+        out = np.transpose(out, ref_perm)
+        data_perm = (
+            list(range(axis, axis + segment_ids.ndim))
+            + list(range(0, axis))
+            + list(range(axis + segment_ids.ndim, data.ndim))
+        )
+        data_copy = np.transpose(data, data_perm)
+    else:
+        data_copy = data
+    for (idx, i) in np.ndenumerate(segment_ids):
+        out[i] += data_copy[idx]
+    if axis != 0:
+        ref_perm = list(range(1, axis + 1)) + [0] + list(range(axis + 1, out.ndim))
+        out = np.transpose(out, ref_perm)
+    return (data, segment_ids, out)
+
+
+def _test_unsorted_segment_sum_model_parallel_fw(
+    test_case, device_type, out_shape, segment_ids_shape, axis, split_axis
+):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    (data_arr, segment_ids_arr, out_arr) = _gen_test_data(
+        out_shape, segment_ids_shape, axis
+    )
+
+    @flow.global_function(function_config=func_config)
+    def unsorted_segment_sum_job(
+        data: oft.Numpy.Placeholder(data_arr.shape, dtype=flow.float),
+        segment_ids: oft.Numpy.Placeholder(segment_ids_arr.shape, dtype=flow.int32),
+        like: oft.Numpy.Placeholder(out_arr.shape, dtype=flow.float),
+    ):
+        with flow.scope.placement(device_type, "0:0-3"):
+            if split_axis < axis:
+                data = data.with_distribute(flow.distribute.split(split_axis))
+            elif split_axis == axis:
+                data = data.with_distribute(flow.distribute.broadcast())
+            else:
+                data = data.with_distribute(
+                    flow.distribute.split(split_axis + len(segment_ids.shape) - 1)
+                )
+            segment_ids = segment_ids.with_distribute(flow.distribute.broadcast())
+            like = like.with_distribute(flow.distribute.split(split_axis))
+            if split_axis == axis:
+                out0 = like
+            else:
+                out0 = flow.unsorted_segment_sum(
+                    data=data,
+                    segment_ids=segment_ids,
+                    num_segments=out_shape[axis],
+                    axis=axis,
+                )
+            out1 = flow.unsorted_segment_sum_like(
+                data=data, segment_ids=segment_ids, like=like, axis=axis
+            )
+            return (out0, out1)
+
+    (out0, out1) = unsorted_segment_sum_job(data_arr, segment_ids_arr, out_arr).get()
+    test_case.assertTrue(np.allclose(out0.numpy(), out_arr))
+    test_case.assertTrue(np.allclose(out1.numpy(), out_arr))
+
+
+@flow.unittest.skip_unless_1n4d()
+class TestUnsortedSegmentSumModelParallel(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_unsorted_segment_sum_model_parallel_fw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["cpu", "gpu"]
+        arg_dict["out_shape"] = [(96, 96, 96)]
+        arg_dict["segment_ids_shape"] = [(32, 48)]
+        arg_dict["axis"] = [0, 1, 2]
+        arg_dict["split_axis"] = [0, 1, 2]
+        for arg in GenArgList(arg_dict):
+            _test_unsorted_segment_sum_model_parallel_fw(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_upsample.py b/python/oneflow/compatible/single_client/test/ops/test_upsample.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0e41e3834d18e22970fabcbc66c2798d3ef9391
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_upsample.py
@@ -0,0 +1,377 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def cartesian(arrays, out=None):
+    """
+    From https://stackoverflow.com/a/1235363
+    Generate a cartesian product of input arrays.
+    Parameters
+    ----------
+    arrays : list of array-like
+        1-D arrays to form the cartesian product of.
+    out : ndarray
+        Array to place the cartesian product in.
+    Returns
+    -------
+    out : ndarray
+        2-D array of shape (M, len(arrays)) containing cartesian products
+        formed of input arrays.
+    Examples
+    --------
+    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
+    array([[1, 4, 6],
+           [1, 4, 7],
+           [1, 5, 6],
+           [1, 5, 7],
+           [2, 4, 6],
+           [2, 4, 7],
+           [2, 5, 6],
+           [2, 5, 7],
+           [3, 4, 6],
+           [3, 4, 7],
+           [3, 5, 6],
+           [3, 5, 7]])
+    """
+    arrays = [np.asarray(x) for x in arrays]
+    dtype = arrays[0].dtype
+    n = np.prod([x.size for x in arrays])
+    if out is None:
+        out = np.zeros([n, len(arrays)], dtype=dtype)
+    m = n // arrays[0].size
+    out[:, 0] = np.repeat(arrays[0], m)
+    if arrays[1:]:
+        cartesian(arrays[1:], out=out[0:m, 1:])
+        for j in range(1, arrays[0].size):
+            out[j * m : (j + 1) * m, 1:] = out[0:m, 1:]
+    return out
+
+
+def interpolate_1d_with_x(
+    data,
+    scale_factor,
+    x,
+    get_coeffs,
+    roi=None,
+    extrapolation_value=0.0,
+    scaler="half_pixel",
+    exclude_outside=False,
+):
+    def get_neighbor_idxes(x, n, limit):
+        """
+        Return the n nearest indexes, prefer the indexes smaller than x
+        As a result, the ratio must be in (0, 1]
+        Examples:
+        get_neighbor_idxes(4, 2, 10) == [3, 4]
+        get_neighbor_idxes(4, 3, 10) == [3, 4, 5]
+        get_neighbor_idxes(4.4, 3, 10) == [3, 4, 5]
+        get_neighbor_idxes(4.5, 3, 10) == [3, 4, 5]
+        get_neighbor_idxes(4.6, 3, 10) == [4, 5, 6]
+        get_neighbor_idxes(4.4, 1, 10) == [4]
+        get_neighbor_idxes(4.6, 1, 10) == [5]
+        :param x:
+        :param n: the number of the wanted indexes
+        :param limit: the maximum value of index
+        :return: An np.array containing n nearest indexes in ascending order
+        """
+        idxes = sorted(range(limit), key=lambda idx: (abs(x - idx), idx))[:n]
+        idxes = sorted(idxes)
+        return np.array(idxes)
+
+    def get_neighbor(x, n, data):
+        """
+        Pad `data` in 'edge' mode, and get n nearest elements in the padded array and their indexes in the original
+        array
+        :param x:
+        :param n:  the number of the wanted elements
+        :param data: the array
+        :return: A tuple containing the indexes of neighbor elements (the index can be smaller than 0 or higher than
+        len(data)) and the value of these elements
+        """
+        pad_width = np.ceil(n / 2).astype(np.int32)
+        padded = np.pad(data, pad_width, mode="edge")
+        x += pad_width
+        idxes = get_neighbor_idxes(x, n, len(padded))
+        ret = padded[idxes]
+        return (idxes - pad_width, ret)
+
+    input_width = len(data)
+    output_width = scale_factor * input_width
+    if scaler == "align_corners":
+        if output_width == 1:
+            x_ori = 0.0
+        else:
+            x_ori = x * (input_width - 1) / (output_width - 1)
+    elif scaler == "asymmetric":
+        x_ori = x / scale_factor
+    elif scaler == "pytorch_half_pixel":
+        if output_width == 1:
+            x_ori = -0.5
+        else:
+            x_ori = (x + 0.5) / scale_factor - 0.5
+    else:
+        x_ori = (x + 0.5) / scale_factor - 0.5
+    x_ori_int = np.floor(x_ori).astype(np.int32).item()
+    if x_ori.is_integer():
+        ratio = 1
+    else:
+        ratio = x_ori - x_ori_int
+    coeffs = get_coeffs(ratio)
+    n = len(coeffs)
+    (idxes, points) = get_neighbor(x_ori, n, data)
+    if exclude_outside:
+        for (i, idx) in enumerate(idxes):
+            if idx < 0 or idx >= input_width:
+                coeffs[i] = 0
+        coeffs /= sum(coeffs)
+    return np.dot(coeffs, points).item()
+
+
+def interpolate_nd_with_x(data, n, scale_factors, x, get_coeffs, roi=None, **kwargs):
+    if n == 1:
+        return interpolate_1d_with_x(
+            data, scale_factors[0], x[0], get_coeffs, roi=roi, **kwargs
+        )
+    return interpolate_1d_with_x(
+        [
+            interpolate_nd_with_x(
+                data[i],
+                n - 1,
+                scale_factors[1:],
+                x[1:],
+                get_coeffs,
+                roi=None if roi is None else np.concatenate([roi[1:n], roi[n + 1 :]]),
+                **kwargs
+            )
+            for i in range(data.shape[0])
+        ],
+        scale_factors[0],
+        x[0],
+        get_coeffs,
+        roi=None if roi is None else [roi[0], roi[n]],
+        **kwargs
+    )
+
+
+def interpolate_nd(
+    data, get_coeffs, output_size=None, scale_factors=None, roi=None, **kwargs
+):
+    def get_all_coords(data):
+        return cartesian([list(range(data.shape[i])) for i in range(len(data.shape))])
+
+    assert output_size is not None or scale_factors is not None
+    if output_size is not None:
+        scale_factors = np.array(output_size) / np.array(data.shape)
+    else:
+        if isinstance(scale_factors, int):
+            height_scale = scale_factors
+            width_scale = scale_factors
+        else:
+            assert isinstance(scale_factors, (list, tuple))
+            assert len(scale_factors) == 2
+            height_scale = scale_factors[0]
+            width_scale = scale_factors[1]
+        output_size = np.stack(
+            [
+                data.shape[0],
+                data.shape[1],
+                data.shape[2] * height_scale,
+                data.shape[3] * width_scale,
+            ]
+        ).astype(np.int32)
+        scale_factors = np.array([1, 1, height_scale, width_scale])
+    assert scale_factors is not None
+    ret = np.zeros(output_size)
+    for x in get_all_coords(ret):
+        ret[tuple(x)] = interpolate_nd_with_x(
+            data, len(data.shape), scale_factors, x, get_coeffs, roi=roi, **kwargs
+        )
+    return ret
+
+
+def linear_coeffs(ratio):
+    return np.array([1 - ratio, ratio])
+
+
+def nearest_coeffs(ratio, mode="round_prefer_floor"):
+    if type(ratio) == int or ratio.is_integer():
+        return np.array([0, 1])
+    elif mode == "round_prefer_floor":
+        return np.array([ratio <= 0.5, ratio > 0.5])
+    elif mode == "round_prefer_ceil":
+        return np.array([ratio < 0.5, ratio >= 0.5])
+    elif mode == "floor":
+        return np.array([1, 0])
+    elif mode == "ceil":
+        return np.array([0, 1])
+
+
+def compare_with_tensorflow(
+    device_type, input_shape, dtype, size, data_format, interpolation, align_corners
+):
+    assert device_type in ["gpu", "cpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def UpsampleJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "input",
+                shape=input_shape,
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
+                trainable=True,
+            )
+            loss = flow.layers.upsample_2d(
+                x,
+                size=size,
+                data_format=data_format,
+                interpolation=interpolation,
+                align_corners=align_corners,
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(loss)
+            flow.watch(x, test_global_storage.Setter("x"))
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            flow.watch(loss, test_global_storage.Setter("loss"))
+            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
+            return loss
+
+    of_out = UpsampleJob().get()
+    channel_pos = "channels_first" if data_format.startswith("NC") else "channels_last"
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(test_global_storage.Get("x").astype(np.float32))
+        tf_out = tf.keras.layers.UpSampling2D(
+            size=size, data_format=channel_pos, interpolation=interpolation
+        )(x)
+    loss_diff = test_global_storage.Get("loss_diff").astype(np.float32)
+    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
+    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-05, atol=1e-05)
+    assert np.allclose(
+        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-05, atol=1e-05
+    )
+
+
+def compare_with_numpy(
+    device_type, input_shape, dtype, size, data_format, interpolation, align_corners
+):
+    assert device_type in ["gpu"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="predict", function_config=func_config)
+    def UpsampleJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "input",
+                shape=input_shape,
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
+                trainable=False,
+            )
+            loss = flow.layers.upsample_2d(
+                x,
+                size=size,
+                data_format=data_format,
+                interpolation=interpolation,
+                align_corners=align_corners,
+            )
+            flow.watch(x, test_global_storage.Setter("x1"))
+            flow.watch(loss, test_global_storage.Setter("loss1"))
+            return loss
+
+    of_out = UpsampleJob().get()
+    channel_pos = "channels_first" if data_format.startswith("NC") else "channels_last"
+    if align_corners:
+        assert interpolation == "bilinear"
+        x = test_global_storage.Get("x1")
+        if data_format == "NHWC":
+            x = np.transpose(x, axes=[0, 3, 1, 2])
+        coeffs_dict = {"bilinear": linear_coeffs}
+        coeffs = coeffs_dict[interpolation]
+        scaler = "align_corners"
+        np_out = interpolate_nd(x, coeffs, scale_factors=size, scaler=scaler).astype(
+            np.float32
+        )
+        of_out_np = of_out.numpy()
+        if data_format == "NHWC":
+            of_out_np = np.transpose(of_out_np, axes=[0, 3, 1, 2])
+        assert np.allclose(of_out_np, np_out, rtol=1e-05, atol=1e-05)
+    else:
+        x = test_global_storage.Get("x1")
+        if data_format == "NHWC":
+            x = np.transpose(x, axes=[0, 3, 1, 2])
+        coeffs_dict = {"bilinear": linear_coeffs, "nearest": nearest_coeffs}
+        coeffs = coeffs_dict[interpolation]
+        scaler = "pytorch_half_pixel"
+        np_out = interpolate_nd(x, coeffs, scale_factors=size, scaler=scaler).astype(
+            np.float32
+        )
+        of_out_np = of_out.numpy()
+        if data_format == "NHWC":
+            of_out_np = np.transpose(of_out_np, axes=[0, 3, 1, 2])
+        assert np.allclose(of_out_np, np_out, rtol=1e-05, atol=1e-05)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUpsample(flow.unittest.TestCase):
+    def test_upsample(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["input_shape"] = [(2, 11, 12, 13)]
+        arg_dict["dtype"] = ["float32", "double"]
+        arg_dict["size"] = [(2, 2), 3, (1, 2)]
+        arg_dict["data_format"] = ["NCHW", "NHWC"]
+        arg_dict["interpolation"] = ["nearest", "bilinear"]
+        arg_dict["align_corners"] = [False]
+        for arg in GenArgList(arg_dict):
+            compare_with_tensorflow(*arg)
+
+    def test_upsample_align_corners(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["input_shape"] = [(2, 5, 6, 7)]
+        arg_dict["dtype"] = ["float32", "double"]
+        arg_dict["size"] = [(2, 2), 3, (1, 2)]
+        arg_dict["data_format"] = ["NCHW", "NHWC"]
+        arg_dict["interpolation"] = ["bilinear"]
+        arg_dict["align_corners"] = [True, False]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_user_op_attr_auto_type.py b/python/oneflow/compatible/single_client/test/ops/test_user_op_attr_auto_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8a9c9f73565d1bf4a4a74d30c93e36755bd2a0e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_user_op_attr_auto_type.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def _test_user_op_attr_auto_type(input, attr1, attr2):
+    return (
+        flow.user_op_builder("test_user_op_attr_auto_type")
+        .Op("test_user_op_attr_auto_type")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("int1", attr1)
+        .Attr("int2", attr2)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUserOpAttrAutoType(flow.unittest.TestCase):
+    def test_user_op_attr_auto_type(test_case):
+        flow.clear_default_session()
+        function_config = flow.FunctionConfig()
+        function_config.default_data_type(flow.float)
+
+        @flow.global_function(function_config=function_config)
+        def _test_user_op_attr_auto_type_job(
+            input: oft.Numpy.Placeholder((1,), dtype=flow.float)
+        ):
+            attr1 = 1
+            attr2 = 2
+            return _test_user_op_attr_auto_type(input, attr1, attr2)
+
+        input = [1]
+        _test_user_op_attr_auto_type_job(np.array(input, dtype=np.float32))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_user_op_module.py b/python/oneflow/compatible/single_client/test/ops/test_user_op_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..955115d7fb504bf038b604de4e9f0298655f95b1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_user_op_module.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from typing import Tuple
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+class Add(flow.deprecated.nn.Module):
+    def __init__(self):
+        flow.deprecated.nn.Module.__init__(self)
+        self.module_builder_ = flow.consistent_user_op_module_builder("add_n")
+        self.module_builder_.InputSize("in", 2).Output("out")
+        self.module_builder_.user_op_module.InitOpKernel()
+
+    def forward(self, x, y):
+        return (
+            self.module_builder_.OpName("Add_%s" % self.call_seq_no)
+            .Input("in", [x, y])
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+
+
+def _make_global_func(test_case, x_shape, y_shape):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+
+    @flow.global_function(function_config=func_config)
+    def AddJob(
+        x: oft.Numpy.Placeholder(shape=x_shape), y: oft.Numpy.Placeholder(shape=y_shape)
+    ) -> oft.Numpy:
+        with flow.scope.namespace("AddJob"):
+            add_op = flow.find_or_create_module("Add", Add)
+            z = add_op(x, y)
+            test_case.assertTrue(
+                z.op_name == "AddJob-Add_{}".format(add_op.call_seq_no - 1)
+            )
+            v = add_op(z, x)
+            test_case.assertTrue(
+                v.op_name == "AddJob-Add_{}".format(add_op.call_seq_no - 1)
+            )
+        return z
+
+    return AddJob
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUserOpModule(flow.unittest.TestCase):
+    def test_Add(test_case):
+        @flow.global_function()
+        def AddJob(xs: Tuple[(oft.Numpy.Placeholder((5, 2)),) * 2]):
+            adder = flow.find_or_create_module("Add", Add)
+            x = adder(*xs)
+            y = adder(*xs)
+            return adder(x, y)
+
+        inputs = tuple((np.random.rand(5, 2).astype(np.float32) for i in range(2)))
+        r = AddJob(inputs).get().numpy()
+        test_case.assertTrue(np.allclose(r, sum(inputs) * 2))
+        r = AddJob(inputs).get().numpy()
+        test_case.assertTrue(np.allclose(r, sum(inputs) * 2))
+
+    def test_find_or_create_module_reuse(test_case):
+        @flow.global_function()
+        def AddJob(xs: Tuple[(oft.Numpy.Placeholder((5, 2)),) * 2]):
+            adder = flow.find_or_create_module("Add", Add, reuse=True)
+            adder = flow.find_or_create_module("Add", Add, reuse=True)
+            x = adder(*xs)
+            return adder(x, x)
+
+        inputs = tuple((np.random.rand(5, 2).astype(np.float32) for i in range(2)))
+        r = AddJob(inputs).get().numpy()
+
+    def test_user_op_module_builder_in_namespace(test_case):
+        x = np.random.rand(2, 5).astype(np.float32)
+        y = np.random.rand(2, 5).astype(np.float32)
+        flow.clear_default_session()
+        add_func = _make_global_func(test_case, x.shape, y.shape)
+        ret = add_func(x, y)
+        test_case.assertTrue(np.array_equal(ret, x + y))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_util.py b/python/oneflow/compatible/single_client/test/ops/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1461a7363f60f2612c0ec65d34150dae9ca73c4e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_util.py
@@ -0,0 +1,185 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import itertools
+import os
+from collections import OrderedDict
+from collections.abc import Iterable
+
+import numpy as np
+import test_global_storage
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+def GenCartesianProduct(sets):
+    assert isinstance(sets, Iterable)
+    for set in sets:
+        assert isinstance(set, Iterable)
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            if "gpu" in set:
+                set.remove("gpu")
+    return itertools.product(*sets)
+
+
+def GenArgList(arg_dict):
+    assert isinstance(arg_dict, OrderedDict)
+    assert all([isinstance(x, list) for x in arg_dict.values()])
+    sets = [arg_set for (_, arg_set) in arg_dict.items()]
+    return GenCartesianProduct(sets)
+
+
+def GenArgDict(arg_dict):
+    return [dict(zip(arg_dict.keys(), x)) for x in GenArgList(arg_dict)]
+
+
+class Args:
+    def __init__(self, flow_args, tf_args=None):
+        super().__init__()
+        if tf_args is None:
+            tf_args = flow_args
+        self.flow_args = flow_args
+        self.tf_args = tf_args
+
+    def __str__(self):
+        return "flow_args={} tf_args={}".format(self.flow_args, self.tf_args)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def RunOneflowOp(device_type, flow_op, x, flow_args):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def FlowJob(x: oft.Numpy.Placeholder(x.shape)):
+        with flow.scope.placement(device_type, "0:0"):
+            x += flow.get_variable(
+                name="v1",
+                shape=(1,),
+                dtype=flow.float,
+                initializer=flow.zeros_initializer(),
+            )
+            loss = flow_op(x, *flow_args)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+            ).minimize(loss)
+            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
+            return loss
+
+    y = FlowJob(x).get().numpy()
+    x_diff = test_global_storage.Get("x_diff")
+    return (y, x_diff)
+
+
+def RunTensorFlowOp(tf_op, x, tf_args):
+    import tensorflow as tf
+
+    gpus = tf.config.experimental.list_physical_devices("GPU")
+    for gpu in gpus:
+        tf.config.experimental.set_memory_growth(gpu, True)
+    with tf.GradientTape(persistent=True) as tape:
+        x = tf.Variable(x)
+        y = tf_op(x, *tf_args)
+    x_diff = tape.gradient(y, x)
+    return (y.numpy(), x_diff.numpy())
+
+
+def CompareOpWithTensorFlow(
+    device_type,
+    flow_op,
+    tf_op,
+    input_shape,
+    op_args=None,
+    input_minval=-10,
+    input_maxval=10,
+    y_rtol=1e-05,
+    y_atol=1e-05,
+    x_diff_rtol=1e-05,
+    x_diff_atol=1e-05,
+):
+    assert device_type in ["gpu", "cpu"]
+    if op_args is None:
+        (flow_args, tf_args) = ([], [])
+    else:
+        (flow_args, tf_args) = (op_args.flow_args, op_args.tf_args)
+    x = np.random.uniform(low=input_minval, high=input_maxval, size=input_shape).astype(
+        np.float32
+    )
+    (of_y, of_x_diff) = RunOneflowOp(device_type, flow_op, x, flow_args)
+    (tf_y, tf_x_diff) = RunTensorFlowOp(tf_op, x, tf_args)
+    assert np.allclose(of_y, tf_y, rtol=y_rtol, atol=y_atol)
+    assert np.allclose(of_x_diff, tf_x_diff, rtol=x_diff_rtol, atol=x_diff_atol)
+
+
+type_name_to_flow_type = {
+    "float16": flow.float16,
+    "float32": flow.float32,
+    "double": flow.double,
+    "int8": flow.int8,
+    "int32": flow.int32,
+    "int64": flow.int64,
+    "char": flow.char,
+    "uint8": flow.uint8,
+}
+type_name_to_np_type = {
+    "float16": np.float16,
+    "float32": np.float32,
+    "double": np.float64,
+    "int8": np.int8,
+    "int32": np.int32,
+    "int64": np.int64,
+    "char": np.byte,
+    "uint8": np.uint8,
+}
+
+
+def FlattenArray(input_array):
+    output_array = list()
+    for x in np.nditer(input_array):
+        output_array.append(x.tolist())
+    return output_array
+
+
+def Array2Numpy(input_array, target_shape):
+    return np.array(input_array).reshape(target_shape, order="C")
+
+
+def Index2Coordinate(idx, tensor_shape):
+    coordinate = []
+    tmp = idx
+    for i in range(len(tensor_shape) - 1, -1, -1):
+        axis_size = tensor_shape[i]
+        coor = tmp % axis_size
+        coordinate.insert(0, int(coor))
+        tmp = (tmp - coor) / axis_size
+    return coordinate
+
+
+def Coordinate2Index(coordinate, tensor_shape):
+    if len(coordinate) != len(tensor_shape):
+        raise "wrong coordinate or shape"
+    idx = 0
+    for (i, coor) in enumerate(coordinate):
+        size_at_axis = coor
+        for j in range(i + 1, len(tensor_shape)):
+            size_at_axis *= tensor_shape[j]
+        idx += size_at_axis
+    return idx
diff --git a/python/oneflow/compatible/single_client/test/ops/test_watch.py b/python/oneflow/compatible/single_client/test/ops/test_watch.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3db04c54c370a3613e961cc0cde70a5bf945684
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_watch.py
@@ -0,0 +1,63 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestWatch(flow.unittest.TestCase):
+    def test_simple(test_case):
+        flow.config.gpu_device_num(1)
+        data = np.ones((10,), dtype=np.float32)
+
+        def EqOnes(x):
+            test_case.assertTrue(np.allclose(data, x.numpy()))
+
+        @flow.global_function()
+        def ReluJob(x: oft.Numpy.Placeholder((10,))):
+            flow.watch(x, EqOnes)
+
+        ReluJob(data)
+
+    @unittest.skipIf(
+        flow.unittest.env.eager_execution_enabled(), "Doesn't work in eager mode"
+    )
+    def test_two_device(test_case):
+        flow.config.gpu_device_num(2)
+        data = np.ones((10,), dtype=np.float32)
+
+        def EqOnes(x):
+            test_case.assertTrue(np.allclose(data, x.numpy()))
+
+        func_config = flow.FunctionConfig()
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def ReluJob(x: oft.Numpy.Placeholder((10,))):
+            y = flow.math.relu(x)
+            flow.watch(y, EqOnes)
+
+        ReluJob(data)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_watch_diff.py b/python/oneflow/compatible/single_client/test/ops/test_watch_diff.py
new file mode 100644
index 0000000000000000000000000000000000000000..185812f08ef32616bf684b83f00c2ee638b41f63
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_watch_diff.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import test_global_storage
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def WatchDiff(test_case, device_type, input_shape, dtype):
+    assert device_type in ["gpu", "cpu"]
+    assert dtype in ["float32", "double"]
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    def CheckOnes(diff):
+        ones = np.ones(input_shape)
+        test_case.assertTrue(np.allclose(diff.numpy(), ones, rtol=1e-05, atol=1e-05))
+
+    @flow.global_function(type="train", function_config=func_config)
+    def TrainJob():
+        with flow.scope.placement(device_type, "0:0"):
+            x = flow.get_variable(
+                "in",
+                shape=input_shape,
+                dtype=type_name_to_flow_type[dtype],
+                initializer=flow.random_uniform_initializer(),
+                trainable=True,
+            )
+            flow.watch_diff(x, CheckOnes)
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+            ).minimize(x)
+
+    TrainJob()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestWatchDiff(flow.unittest.TestCase):
+    def test_watch_diff(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["input_shape"] = [(10,)]
+        arg_dict["dtype"] = ["float32"]
+        for arg in GenArgList(arg_dict):
+            WatchDiff(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_where.py b/python/oneflow/compatible/single_client/test/ops/test_where.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba52a1b04a8d917c7325965de3d6f1ee9de5bd85
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_where.py
@@ -0,0 +1,315 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import tensorflow as tf
+from test_util import GenArgDict
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as oft
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+
+def _random_input(cond_shape, x_shape, y_shape):
+    condition = np.random.randint(low=0, high=2, size=cond_shape).astype(np.int32)
+    x = np.random.standard_normal(x_shape).astype(np.float32)
+    y = np.random.standard_normal(y_shape).astype(np.float32)
+    return (condition, x, y)
+
+
+def _of_where(
+    condition,
+    x,
+    y,
+    device_type="gpu",
+    machine_device_ids="0:0",
+    dynamic=False,
+    dz_dx_watcher=None,
+    dz_dy_watcher=None,
+):
+    flow.clear_default_session()
+    flow.config.gpu_device_num(4)
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if callable(dz_dx_watcher) and callable(dz_dy_watcher):
+        func_config_type = "train"
+
+        def do_where(condition, x, y):
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    "x",
+                    shape=x.shape,
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                x_var = x_var + x
+                y_var = flow.get_variable(
+                    "y",
+                    shape=y.shape,
+                    dtype=flow.float,
+                    initializer=flow.constant_initializer(0),
+                )
+                y_var = flow.cast_to_current_logical_view(y_var)
+                y_var = y_var + y
+            z = flow.where(condition, x_var, y_var)
+            with flow.scope.placement(device_type, "0:0"):
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+                ).minimize(z)
+            flow.watch_diff(x_var, dz_dx_watcher)
+            flow.watch_diff(y_var, dz_dy_watcher)
+            return z
+
+    else:
+        func_config_type = "predict"
+
+        def do_where(condition, x, y):
+            return flow.where(condition, x, y)
+
+    if dynamic:
+        func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(type=func_config_type, function_config=func_config)
+        def where_fn(
+            condition_def: oft.ListNumpy.Placeholder(condition.shape, dtype=flow.int32),
+            x_def: oft.ListNumpy.Placeholder(x.shape, dtype=flow.float),
+            y_def: oft.ListNumpy.Placeholder(y.shape, dtype=flow.float),
+        ):
+            return do_where(condition_def, x_def, y_def)
+
+        return where_fn([condition], [x], [y]).get().numpy_list()[0]
+    else:
+        func_config.default_placement_scope(
+            flow.scope.placement(device_type, machine_device_ids)
+        )
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(type=func_config_type, function_config=func_config)
+        def where_fn(
+            condition_def: oft.Numpy.Placeholder(condition.shape, dtype=flow.int32),
+            x_def: oft.Numpy.Placeholder(x.shape, dtype=flow.float),
+            y_def: oft.Numpy.Placeholder(y.shape, dtype=flow.float),
+        ):
+            return do_where(condition_def, x_def, y_def)
+
+        return where_fn(condition, x, y).get().numpy()
+
+
+def _compare_with_np(test_case, cond_shape, x_shape, y_shape, device_type, dynamic):
+    (condition, x, y) = _random_input(cond_shape, x_shape, y_shape)
+    z = np.where(condition, x, y)
+    of_z = _of_where(condition, x, y, device_type, "0:0", dynamic)
+    test_case.assertTrue(np.array_equal(z, of_z))
+
+
+def _compare_with_tf(
+    test_case,
+    cond_shape,
+    x_shape,
+    y_shape,
+    device_type="gpu",
+    machine_device_ids="0:0",
+    dynamic=False,
+    verbose=False,
+):
+    (condition, x, y) = _random_input(cond_shape, x_shape, y_shape)
+    condition_constant = tf.constant(condition, dtype=tf.bool)
+    with tf.GradientTape(persistent=True) as t:
+        x_var = tf.Variable(x)
+        y_var = tf.Variable(y)
+        z = tf.where(condition_constant, x_var, y_var)
+    dz_dx = t.gradient(z, x_var)
+    dz_dy = t.gradient(z, y_var)
+
+    def compare_dz_dx(dz_dx_blob):
+        if verbose:
+            print("condition:", condition)
+            print("tf_dz_dx:", dz_dx.numpy())
+            print(
+                "of_dz_dx:",
+                dz_dx_blob.numpy_list()[0] if dynamic else dz_dx_blob.numpy(),
+            )
+        test_case.assertTrue(
+            np.array_equal(
+                dz_dx.numpy(),
+                dz_dx_blob.numpy_list()[0] if dynamic else dz_dx_blob.numpy(),
+            )
+        )
+
+    def compare_dz_dy(dz_dy_blob):
+        if verbose:
+            print("condition:", condition)
+            print("tf_dz_dy:", dz_dy.numpy())
+            print(
+                "of_dz_dy:",
+                dz_dy_blob.numpy_list()[0] if dynamic else dz_dy_blob.numpy(),
+            )
+        test_case.assertTrue(
+            np.array_equal(
+                dz_dy.numpy(),
+                dz_dy_blob.numpy_list()[0] if dynamic else dz_dy_blob.numpy(),
+            )
+        )
+
+    of_z = _of_where(
+        condition,
+        x,
+        y,
+        device_type,
+        machine_device_ids,
+        dynamic,
+        compare_dz_dx,
+        compare_dz_dy,
+    )
+    test_case.assertTrue(np.array_equal(z.numpy(), of_z))
+
+
+def _of_where_with_x_and_y_are_none(input, input_shape=None):
+    flow.clear_default_session()
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    if input_shape is None:
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(function_config=func_config)
+        def where_fn(input_def: oft.Numpy.Placeholder(input.shape, dtype=flow.float)):
+            return flow.where(input_def)
+
+    else:
+        func_config.default_logical_view(flow.scope.mirrored_view())
+
+        @flow.global_function(function_config=func_config)
+        def where_fn(
+            input_def: oft.ListNumpy.Placeholder(input_shape, dtype=flow.float)
+        ):
+            return flow.where(input_def)
+
+    return where_fn([input]).get().numpy_list()[0]
+
+
+@unittest.skipIf(True, "skip for now because of single-client tensor_list removed")
+class TestWhere(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[5, 10]]
+        arg_dict["x_shape"] = [[5, 10]]
+        arg_dict["y_shape"] = [[5, 10]]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[4, 5, 8]]
+        arg_dict["x_shape"] = [[1, 5, 8]]
+        arg_dict["y_shape"] = [[4, 1, 8]]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_case_2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[10, 7, 9]]
+        arg_dict["x_shape"] = [[20, 10, 7, 9]]
+        arg_dict["y_shape"] = [[20, 10, 1, 1]]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_case_3(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[12, 25, 6]]
+        arg_dict["x_shape"] = [[12, 1, 6]]
+        arg_dict["y_shape"] = [[25, 1]]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_np(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_grad(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[10]]
+        arg_dict["x_shape"] = [[10]]
+        arg_dict["y_shape"] = [[10]]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["machine_device_ids"] = ["0:0"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_tf(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_grad_case_1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[3, 7, 10]]
+        arg_dict["x_shape"] = [[3, 1, 10]]
+        arg_dict["y_shape"] = [[7, 10]]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_tf(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_grad_case_2(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[16, 1]]
+        arg_dict["x_shape"] = [[4, 1, 20]]
+        arg_dict["y_shape"] = [[8, 4, 16, 20]]
+        arg_dict["device_type"] = ["gpu", "cpu"]
+        arg_dict["dynamic"] = [True, False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_tf(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_grad_4card(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["cond_shape"] = [[10]]
+        arg_dict["x_shape"] = [[10]]
+        arg_dict["y_shape"] = [[10]]
+        arg_dict["device_type"] = ["gpu"]
+        arg_dict["machine_device_ids"] = ["0:0-3"]
+        arg_dict["dynamic"] = [False]
+        for arg in GenArgDict(arg_dict):
+            _compare_with_tf(test_case, **arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_where_argwhere(test_case):
+        rand_input = np.random.random_sample((11, 3, 5)).astype(np.float32)
+        rand_input[np.nonzero(rand_input < 0.5)] = 0.0
+        ret = _of_where_with_x_and_y_are_none(rand_input, input_shape=(11, 3, 5))
+        exp_ret = np.argwhere(rand_input)
+        test_case.assertTrue(np.array_equal(exp_ret, ret))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_zero_pad2d.py b/python/oneflow/compatible/single_client/test/ops/test_zero_pad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..62c04e7fd3ba59cc2f4287140ea570da4c17ea0e
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_zero_pad2d.py
@@ -0,0 +1,264 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import (
+    Args,
+    Array2Numpy,
+    Coordinate2Index,
+    FlattenArray,
+    GenArgDict,
+    GenArgList,
+    Index2Coordinate,
+)
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _make_op_function(
+    test_case, input, padding, grad, device_type, value_type, machine_ids, device_counts
+):
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    if value_type == flow.float16:
+        func_config.default_data_type(flow.float32)
+    else:
+        func_config.default_data_type(value_type)
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    func_config.default_logical_view(flow.scope.consistent_view())
+
+    def _compare_diff(blob: tp.Numpy):
+        test_case.assertTrue(np.allclose(grad, blob, 0.001, 0.001))
+
+    if value_type == flow.float32 or value_type == flow.float64:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=value_type)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=value_type,
+                    initializer=flow.zeros_initializer(),
+                )
+                out = flow.zero_pad2d(x, padding)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(out)
+            flow.watch_diff(x, _compare_diff)
+            return out
+
+        return op_function
+    elif value_type == flow.int32:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x += flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.zeros_initializer(),
+                )
+                y_int32 = flow.zero_pad2d(x, padding)
+                y_fp32 = flow.cast(y_int32, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x, _compare_diff)
+            return y_fp32
+
+        return op_function
+    elif value_type == flow.float16:
+
+        @flow.global_function(type="train", function_config=func_config)
+        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
+            with flow.scope.placement(device_type, "0:0"):
+                x_var = flow.get_variable(
+                    name="input",
+                    shape=input.shape,
+                    dtype=flow.float32,
+                    initializer=flow.constant_initializer(0),
+                )
+                x_var = flow.cast_to_current_logical_view(x_var)
+                input_x = x_var + x
+                x_fp32 = flow.cast(input_x, flow.float32)
+                x_fp16 = flow.cast(input_x, dtype=flow.float16)
+                y_fp16 = flow.zero_pad2d(x_fp16, padding)
+                y_fp32 = flow.cast(y_fp16, dtype=flow.float32)
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
+                ).minimize(y_fp32)
+            flow.watch_diff(x_fp32, _compare_diff)
+            return y_fp32
+
+        return op_function
+
+
+def gen_numpy_test_sample(input_shape, padding, is_float=True):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    pad_shape = ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right))
+
+    def _np_zero_pad2d(input, pad_shape4):
+        numpy_zero = np.pad(input, pad_shape, "constant", constant_values=0.0)
+        return numpy_zero
+
+    def _np_zero_pad2d_grad(src, dest):
+        (dx_height, dx_width) = (input.shape[h_idx], input.shape[w_idx])
+        (dy_height, dy_width) = (output.shape[h_idx], output.shape[w_idx])
+        numpy_src = np.ones(src.shape, np.int32)
+        numpy_dest = np.zeros(dest.shape, np.int32)
+        array_src = FlattenArray(numpy_src)
+        array_dest = FlattenArray(numpy_dest)
+        src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+        dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+        elements_num = src.shape[0] * src_num
+        for iter_n in range(elements_num):
+            coords = Index2Coordinate(iter_n, src.shape)
+            (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+            ip_x = ip_y = 0
+            if (
+                j >= pad_left
+                and j < dx_width + pad_left
+                and (i >= pad_top)
+                and (i < dx_height + pad_top)
+            ):
+                ip_x = j - pad_left
+                ip_y = i - pad_top
+                src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+                dest_index = (
+                    n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+                )
+                array_dest[dest_index] += array_src[src_index]
+        numpy_dest = Array2Numpy(array_dest, dest.shape)
+        return numpy_dest
+
+    if is_float:
+        input = np.random.random(input_shape).astype(np.float32)
+    else:
+        input = np.random.randint(0, 100, input_shape)
+    output = _np_zero_pad2d(input, pad_shape)
+    grad = _np_zero_pad2d_grad(output, input)
+    numpy_results = {"input": input, "padding": padding, "output": output, "grad": grad}
+    return numpy_results
+
+
+def _compare_op_function_with_samples(
+    test_case, device_type, sample, value_type, machine_ids, device_count
+):
+    op_function = _make_op_function(
+        test_case,
+        sample["input"].astype(value_type[0]),
+        sample["padding"],
+        sample["grad"].astype(value_type[0]),
+        device_type,
+        value_type[1],
+        machine_ids,
+        device_count,
+    )
+    y = (
+        op_function(sample["input"].astype(value_type[0]))
+        .get()
+        .numpy()
+        .astype(value_type[0])
+    )
+    if value_type == flow.float16:
+        test_case.assertTrue(
+            np.allclose(y, sample["output"].astype(np.float32), 0.001, 0.001)
+        )
+    else:
+        test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0])))
+
+
+def _gen_arg_dict(
+    device_type="gpu", value_type="float", machine_ids="0:0", device_count=1
+):
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = [device_type]
+    arg_dict["samples"] = []
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 1, 2, 2), [1, 1, 1, 1]))
+    arg_dict["samples"].append(gen_numpy_test_sample((4, 2, 3, 3), [2, 2, 2, 2]))
+    arg_dict["samples"].append(gen_numpy_test_sample((2, 3, 4, 5), [3, 2, 1, 2]))
+    if value_type == "float":
+        if device_type == "gpu":
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+        else:
+            arg_dict["value_type"] = [(np.float32, flow.float32)]
+    elif value_type == "int":
+        arg_dict["value_type"] = [(np.float32, flow.int32)]
+    else:
+        raise "float or int for value type only"
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_count"] = [device_count]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestZeroPad2d1n1d(flow.unittest.TestCase):
+    def test_op_function_int_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    def test_op_function_float_cpu(test_case):
+        arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float_gpu(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestZeroPad2d1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_float(test_case):
+        arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_op_function_int(test_case):
+        arg_dict = _gen_arg_dict("gpu", "int", "0:0-1", 2)
+        for arg in GenArgList(arg_dict):
+            _compare_op_function_with_samples(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/ops/test_zeros.py b/python/oneflow/compatible/single_client/test/ops/test_zeros.py
new file mode 100644
index 0000000000000000000000000000000000000000..4141e1f11df24df3638c1b5b7ed3ce862243d8ec
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/ops/test_zeros.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.compatible.single_client import typing as tp
+
+
+def _compare_zeros_with_np(input_shape, device_type, machine_ids, device_counts):
+    assert device_type in ["cpu", "gpu"]
+    flow.clear_default_session()
+    if device_type == "cpu":
+        flow.config.cpu_device_num(device_counts)
+    else:
+        flow.config.gpu_device_num(device_counts)
+    func_config = flow.FunctionConfig()
+    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
+    np_out_zeros = np.zeros(shape=input_shape, dtype=np.float32)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def oneflow_zeros() -> tp.Numpy:
+        with flow.scope.placement(device_type, "0:0"):
+            v = flow.get_variable(
+                shape=np_out_zeros.shape,
+                dtype=flow.float32,
+                initializer=flow.zeros_initializer(),
+                name="x_var",
+            )
+        of_zeros = flow.zeros(shape=input_shape, dtype=flow.float32)
+        of_out = of_zeros + v
+        with flow.scope.placement(device_type, "0:0"):
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
+            ).minimize(of_out)
+        return of_zeros
+
+    of_out_zeros = oneflow_zeros()
+    assert np.allclose(of_out_zeros, np_out_zeros)
+
+
+def _gen_arg_dict(shape, device_type, machine_ids, device_counts):
+    arg_dict = OrderedDict()
+    arg_dict["input_shape"] = [shape]
+    arg_dict["device_type"] = [device_type]
+    arg_dict["machine_ids"] = [machine_ids]
+    arg_dict["device_counts"] = [device_counts]
+    return arg_dict
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testzeros1n1d(flow.unittest.TestCase):
+    def test_zeros_cpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 3), device_type="cpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_zeros_with_np(*arg)
+
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_zeros_gpu(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 16, 32), device_type="gpu", machine_ids="0:0", device_counts=1
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_zeros_with_np(*arg)
+
+
+@flow.unittest.skip_unless_1n2d()
+class Testzeros1n2d(flow.unittest.TestCase):
+    @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+    def test_zeros_gpu_1n2d(test_case):
+        arg_dict = _gen_arg_dict(
+            shape=(3, 8, 8, 4), device_type="gpu", machine_ids="0:0-1", device_counts=2
+        )
+        for arg in GenArgList(arg_dict):
+            _compare_zeros_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/serving/alexnet.py b/python/oneflow/compatible/single_client/test/serving/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..26491b084b819e4569d2ddbc9975877f02f75aaa
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/alexnet.py
@@ -0,0 +1,127 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=flow.math.relu,
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.random_uniform_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if callable(activation):
+        output = activation(output)
+    return output
+
+
+def load_data(batch_size, data_dir, data_part_num):
+    rgb_mean = [123.68, 116.78, 103.94]
+    (image, label) = flow.data.ofrecord_image_classification_reader(
+        ofrecord_dir=data_dir,
+        batch_size=batch_size,
+        data_part_num=data_part_num,
+        image_feature_name="encoded",
+        label_feature_name="class/label",
+        color_space="RGB",
+        name="decode",
+    )
+    res_image = flow.image.resize(image, resize_x=227, resize_y=227, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        res_image,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def alexnet(image, label, trainable=True):
+    conv1 = _conv2d_layer(
+        "conv1", image, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv2d_layer("conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv2d_layer("conv3", pool2, filters=384)
+    conv4 = _conv2d_layer("conv4", conv3, filters=384)
+    conv5 = _conv2d_layer("conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+    if len(pool5.shape) > 2:
+        pool5 = flow.flatten(pool5, start_dim=1, end_dim=-1)
+    initializer = flow.truncated_normal_initializer(stddev=0.816496580927726)
+    fc1 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=initializer,
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc1",
+    )
+    dropout1 = fc1
+    fc2 = flow.layers.dense(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=initializer,
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc2",
+    )
+    dropout2 = fc2
+    fc3 = flow.layers.dense(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=initializer,
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc3",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        label, fc3, name="softmax_loss"
+    )
+    return loss
diff --git a/python/oneflow/compatible/single_client/test/serving/insightface_resnet100.py b/python/oneflow/compatible/single_client/test/serving/insightface_resnet100.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef5e697b3c69065ee0575dca61869a33193d767
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/insightface_resnet100.py
@@ -0,0 +1,383 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+from oneflow.core.operator import op_conf_pb2 as op_conf_util
+
+
+def _get_initializer():
+    return flow.variance_scaling_initializer(2.0, "fan_out", "random_normal", "NCHW")
+
+
+def _get_regularizer():
+    return flow.regularizers.l2(0.0005)
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    group_num=1,
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=None,
+    use_bias=False,
+    weight_initializer=_get_initializer(),
+    bias_initializer=flow.zeros_initializer(),
+    weight_regularizer=_get_regularizer(),
+    bias_regularizer=_get_regularizer(),
+):
+    weight_shape = (
+        int(filters),
+        int(input.shape[1] / group_num),
+        int(kernel_size),
+        int(kernel_size),
+    )
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+        regularizer=weight_regularizer,
+    )
+    output = flow.nn.conv2d(
+        input,
+        weight,
+        strides,
+        padding,
+        None,
+        data_format,
+        dilation_rate,
+        groups=group_num,
+        name=name,
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+            regularizer=bias_regularizer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _batch_norm(
+    inputs,
+    epsilon,
+    center=True,
+    scale=True,
+    trainable=True,
+    is_training=True,
+    name=None,
+):
+    return flow.layers.batch_normalization(
+        inputs=inputs,
+        axis=1,
+        momentum=0.9,
+        epsilon=epsilon,
+        center=center,
+        scale=scale,
+        beta_initializer=flow.zeros_initializer(),
+        gamma_initializer=flow.ones_initializer(),
+        beta_regularizer=_get_regularizer(),
+        gamma_regularizer=_get_regularizer(),
+        moving_mean_initializer=flow.zeros_initializer(),
+        moving_variance_initializer=flow.ones_initializer(),
+        trainable=trainable,
+        training=is_training,
+        name=name,
+    )
+
+
+def _prelu(inputs, name=None):
+    return flow.layers.prelu(
+        inputs,
+        alpha_initializer=flow.constant_initializer(0.25),
+        alpha_regularizer=_get_regularizer(),
+        shared_axes=[2, 3],
+        name=name,
+    )
+
+
+def _avg_pool(inputs, pool_size, strides, padding, name=None):
+    return flow.nn.avg_pool2d(
+        input=inputs, ksize=pool_size, strides=strides, padding=padding
+    )
+
+
+def _dropout(input_blob, dropout_prob):
+    return flow.nn.dropout(input_blob, rate=dropout_prob)
+
+
+def Linear(
+    input_blob,
+    num_filter=1,
+    kernel=None,
+    stride=None,
+    pad="valid",
+    num_group=1,
+    bn_is_training=True,
+    name=None,
+    suffix="",
+):
+    conv = _conv2d_layer(
+        name="%s%s_conv2d" % (name, suffix),
+        input=input_blob,
+        filters=num_filter,
+        kernel_size=kernel,
+        strides=stride,
+        padding=pad,
+        group_num=num_group,
+        use_bias=False,
+        dilation_rate=1,
+        activation=None,
+    )
+    bn = _batch_norm(
+        conv,
+        epsilon=0.001,
+        is_training=bn_is_training,
+        name="%s%s_batchnorm" % (name, suffix),
+    )
+    return bn
+
+
+def residual_unit_v3(in_data, num_filter, stride, dim_match, bn_is_training, name):
+    suffix = ""
+    use_se = 0
+    bn1 = _batch_norm(
+        in_data,
+        epsilon=2e-05,
+        is_training=bn_is_training,
+        name="%s%s_bn1" % (name, suffix),
+    )
+    conv1 = _conv2d_layer(
+        name="%s%s_conv1" % (name, suffix),
+        input=bn1,
+        filters=num_filter,
+        kernel_size=3,
+        strides=[1, 1],
+        padding="same",
+        use_bias=False,
+        dilation_rate=1,
+        activation=None,
+    )
+    bn2 = _batch_norm(
+        conv1,
+        epsilon=2e-05,
+        is_training=bn_is_training,
+        name="%s%s_bn2" % (name, suffix),
+    )
+    prelu = _prelu(bn2, name="%s%s_relu1" % (name, suffix))
+    conv2 = _conv2d_layer(
+        name="%s%s_conv2" % (name, suffix),
+        input=prelu,
+        filters=num_filter,
+        kernel_size=3,
+        strides=stride,
+        padding="same",
+        use_bias=False,
+        dilation_rate=1,
+        activation=None,
+    )
+    bn3 = _batch_norm(
+        conv2,
+        epsilon=2e-05,
+        is_training=bn_is_training,
+        name="%s%s_bn3" % (name, suffix),
+    )
+    if use_se:
+        input_blob = _avg_pool(bn3, pool_size=[7, 7], strides=[1, 1], padding="VALID")
+        input_blob = _conv2d_layer(
+            name="%s%s_se_conv1" % (name, suffix),
+            input=input_blob,
+            filters=num_filter // 16,
+            kernel_size=1,
+            strides=[1, 1],
+            padding="valid",
+            use_bias=True,
+            dilation_rate=1,
+            activation=None,
+        )
+        input_blob = _prelu(input_blob, name="%s%s_se_relu1" % (name, suffix))
+        input_blob = _conv2d_layer(
+            name="%s%s_se_conv2" % (name, suffix),
+            input=input_blob,
+            filters=num_filter,
+            kernel_size=1,
+            strides=[1, 1],
+            padding="valid",
+            use_bias=True,
+            dilation_rate=1,
+            activation=None,
+        )
+        input_blob = flow.math.sigmoid(input=input_blob)
+        bn3 = flow.math.multiply(x=input_blob, y=bn3)
+    if dim_match:
+        input_blob = in_data
+    else:
+        input_blob = _conv2d_layer(
+            name="%s%s_conv1sc" % (name, suffix),
+            input=in_data,
+            filters=num_filter,
+            kernel_size=1,
+            strides=stride,
+            padding="valid",
+            use_bias=False,
+            dilation_rate=1,
+            activation=None,
+        )
+        input_blob = _batch_norm(
+            input_blob,
+            epsilon=2e-05,
+            is_training=bn_is_training,
+            name="%s%s_sc" % (name, suffix),
+        )
+    identity = flow.math.add(x=bn3, y=input_blob)
+    return identity
+
+
+def Resnet100(input_blob, embedding_size, fc_type="GDC", bn_is_training=True, **kw):
+    filter_list = [64, 64, 128, 256, 512]
+    num_stages = 4
+    units = [3, 13, 30, 3]
+    input_blob = _conv2d_layer(
+        name="conv0",
+        input=input_blob,
+        filters=filter_list[0],
+        kernel_size=3,
+        strides=[1, 1],
+        padding="same",
+        use_bias=False,
+        dilation_rate=1,
+        activation=None,
+    )
+    input_blob = _batch_norm(
+        input_blob, epsilon=2e-05, is_training=bn_is_training, name="bn0"
+    )
+    input_blob = _prelu(input_blob, name="relu0")
+    for i in range(num_stages):
+        input_blob = residual_unit_v3(
+            input_blob,
+            filter_list[i + 1],
+            [2, 2],
+            False,
+            bn_is_training=bn_is_training,
+            name="stage%d_unit%d" % (i + 1, 1),
+        )
+        for j in range(units[i] - 1):
+            input_blob = residual_unit_v3(
+                input_blob,
+                filter_list[i + 1],
+                [1, 1],
+                True,
+                bn_is_training=bn_is_training,
+                name="stage%d_unit%d" % (i + 1, j + 2),
+            )
+    if fc_type == "GDC":
+        input_blob = Linear(
+            input_blob,
+            num_filter=512,
+            num_group=512,
+            kernel=7,
+            pad="valid",
+            stride=[1, 1],
+            bn_is_training=bn_is_training,
+            name="conv_6dw7_7",
+        )
+        input_blob = flow.reshape(input_blob, (input_blob.shape[0], -1))
+        pre_fc1 = flow.layers.dense(
+            inputs=input_blob,
+            units=embedding_size,
+            activation=None,
+            use_bias=True,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=flow.zeros_initializer(),
+            kernel_regularizer=_get_regularizer(),
+            bias_regularizer=_get_regularizer(),
+            trainable=True,
+            name="pre_fc1",
+        )
+        fc1 = _batch_norm(
+            pre_fc1,
+            epsilon=2e-05,
+            center=True,
+            scale=False,
+            is_training=bn_is_training,
+            name="fc1",
+        )
+    elif fc_type == "E":
+        input_blob = _batch_norm(
+            input_blob, epsilon=2e-05, is_training=bn_is_training, name="bn1"
+        )
+        input_blob = _dropout(input_blob, dropout_prob=0.4)
+        input_blob = flow.reshape(input_blob, (input_blob.shape[0], -1))
+        pre_fc1 = flow.layers.dense(
+            inputs=input_blob,
+            units=embedding_size,
+            activation=None,
+            use_bias=True,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=flow.zeros_initializer(),
+            kernel_regularizer=_get_regularizer(),
+            bias_regularizer=_get_regularizer(),
+            trainable=True,
+            name="pre_fc1",
+        )
+        fc1 = _batch_norm(
+            pre_fc1,
+            epsilon=2e-05,
+            center=True,
+            scale=False,
+            is_training=bn_is_training,
+            name="fc1",
+        )
+    elif fc_type == "FC":
+        input_blob = _batch_norm(
+            input_blob, epsilon=2e-05, is_training=bn_is_training, name="bn1"
+        )
+        input_blob = flow.reshape(input_blob, (input_blob.shape[0], -1))
+        pre_fc1 = flow.layers.dense(
+            inputs=input_blob,
+            units=embedding_size,
+            activation=None,
+            use_bias=True,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=flow.zeros_initializer(),
+            kernel_regularizer=_get_regularizer(),
+            bias_regularizer=_get_regularizer(),
+            trainable=True,
+            name="pre_fc1",
+        )
+        fc1 = _batch_norm(
+            pre_fc1,
+            epsilon=2e-05,
+            center=True,
+            scale=False,
+            is_training=bn_is_training,
+            name="fc1",
+        )
+    else:
+        print("unimplemented")
+    return fc1
diff --git a/python/oneflow/compatible/single_client/test/serving/ofrecord_dataset.py b/python/oneflow/compatible/single_client/test/serving/ofrecord_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b17f42198e656e4111985500b498da3a04e31195
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/ofrecord_dataset.py
@@ -0,0 +1,230 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import random
+import struct
+
+import cv2
+import numpy as np
+
+from oneflow.core.record import record_pb2 as record_pb
+
+
+class OFRecordDataset(object):
+    def __init__(
+        self,
+        data_dir,
+        num_data_parts,
+        part_name_suffix_length,
+        batch_size,
+        shuffle_data_part,
+    ):
+        self.data_dir_ = data_dir
+        self.num_data_parts_ = num_data_parts
+        self.part_name_suffix_length_ = part_name_suffix_length
+        self.batch_size_ = batch_size
+        self.epoch_cnt_ = 0
+        self.cur_data_part_idx_ = 0
+        self.shuffle_data_part_ = shuffle_data_part
+        self.reader_ = None
+        self.num_read_batchs_ = 0
+
+    @property
+    def batch_size(self):
+        return self.batch_size_
+
+    @batch_size.setter
+    def batch_size(self, bs):
+        self.batch_size_ = bs
+
+    @property
+    def num_read_batchs(self):
+        return self.num_read_batchs_
+
+    def __del__(self):
+        if self.reader_ is not None:
+            self.reader_.close()
+
+    def __iter__(self):
+        self._gen_data_part_seq()
+        self._open_data_part_file()
+        while True:
+            yield self._read_one_batch()
+
+    def load_batchs(self, num_batchs):
+        image_list = []
+        label_list = []
+        for (i, (image_array, label_array)) in enumerate(self):
+            if i >= num_batchs:
+                break
+            image_list.append(image_array)
+            label_list.append(label_array)
+        return (image_list, label_list)
+
+    def parse_record(self, record):
+        raise NotImplementedError
+
+    def collate(self, batch):
+        raise NotImplementedError
+
+    def reset(self):
+        self.epoch_cnt_ = 0
+        self.cur_data_part_idx_ = 0
+        if self.reader_ is not None:
+            self.reader_.close()
+        self.num_read_batchs_ = 0
+
+    def _move_to_next_data_part(self):
+        self.cur_data_part_idx_ += 1
+        if self.cur_data_part_idx_ >= len(self.data_part_seq_):
+            self.epoch_cnt_ += 1
+            self._gen_data_part_seq()
+        self._open_data_part_file()
+
+    def _gen_data_part_seq(self):
+        data_part_name_pattern = "part-{:0" + str(self.part_name_suffix_length_) + "d}"
+        self.data_part_seq_ = [
+            data_part_name_pattern.format(i) for i in range(self.num_data_parts_)
+        ]
+        if self.shuffle_data_part_:
+            random.shuffle(self.data_part_seq_)
+
+    def _open_data_part_file(self):
+        if self.reader_ is not None:
+            self.reader_.close()
+        data_part_file_path = os.path.join(
+            self.data_dir_, self.data_part_seq_[self.cur_data_part_idx_]
+        )
+        self.reader_ = open(data_part_file_path, "rb")
+
+    def _read_one_batch(self):
+        assert self.reader_ is not None
+        batch = []
+        for i in range(self.batch_size_):
+            record_head = self.reader_.read(8)
+            if record_head is None or len(record_head) != 8:
+                self._move_to_next_data_part()
+                break
+            record = record_pb.OFRecord()
+            record_byte_size = struct.unpack("q", record_head)[0]
+            record.ParseFromString(self.reader_.read(record_byte_size))
+            batch.append(self.parse_record(record))
+        self.num_read_batchs_ += 1
+        return self.collate(batch)
+
+
+class ImageNetRecordDataset(OFRecordDataset):
+    def __init__(
+        self,
+        data_dir="/dataset/ImageNet/ofrecord/validation",
+        num_data_parts=256,
+        part_name_suffix_length=5,
+        batch_size=4,
+        shuffle_data_part=False,
+        image_resize_size=224,
+        data_format="NCHW",
+    ):
+        super().__init__(
+            data_dir,
+            num_data_parts,
+            part_name_suffix_length,
+            batch_size,
+            shuffle_data_part,
+        )
+        self.image_resize_size_ = image_resize_size
+        self.data_format_ = data_format
+
+    def parse_record(self, record):
+        image_raw_bytes = record.feature["encoded"].bytes_list.value[0]
+        image = cv2.imdecode(
+            np.frombuffer(image_raw_bytes, np.uint8), cv2.IMREAD_COLOR
+        ).astype(np.float32)
+        image = self.preprocess_image(image)
+        label = record.feature["class/label"].int32_list.value[0]
+        return (image, label)
+
+    def collate(self, batch):
+        batched_image = np.stack([data[0] for data in batch], axis=0)
+        batched_label = np.array([data[1] for data in batch], dtype=np.int32)
+        return (batched_image, batched_label)
+
+    def preprocess_image(self, image):
+        image = cv2.resize(image, (self.image_resize_size_, self.image_resize_size_))
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        norm_rgb_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32)
+        norm_rgb_std = np.array([58.393, 57.12, 57.375], dtype=np.float32)
+        image = (image - norm_rgb_mean) / norm_rgb_std
+        if self.data_format_ == "NCHW":
+            assert image.shape[2] == 3
+            image = np.transpose(image, (2, 0, 1))
+        elif self.data_format_ == "NHWC":
+            assert image.shape[2] == 3
+        else:
+            raise ValueError("Unsupported image data format")
+        return np.ascontiguousarray(image)
+
+
+class FaceEmoreRecordDataset(OFRecordDataset):
+    def __init__(
+        self,
+        data_dir="/dataset/insightface/train_ofrecord/faces_emore",
+        num_data_parts=256,
+        part_name_suffix_length=1,
+        batch_size=4,
+        shuffle_data_part=False,
+        image_width=112,
+        image_height=112,
+        color_space="RGB",
+        data_format="NCHW",
+    ):
+        super().__init__(
+            data_dir,
+            num_data_parts,
+            part_name_suffix_length,
+            batch_size,
+            shuffle_data_part,
+        )
+        self.image_width_ = image_width
+        self.image_height_ = image_height
+        self.color_space_ = color_space
+        self.data_format_ = data_format
+
+    def parse_record(self, record):
+        image_raw_bytes = record.feature["encoded"].bytes_list.value[0]
+        image = cv2.imdecode(
+            np.frombuffer(image_raw_bytes, np.uint8), cv2.IMREAD_COLOR
+        ).astype(np.float32)
+        image = self.preprocess_image(image)
+        issame = record.feature["issame"].int32_list.value[0]
+        return (image, issame)
+
+    def preprocess_image(self, image):
+        image = cv2.resize(image, (self.image_height_, self.image_width_))
+        if self.color_space_ == "RGB":
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        if self.data_format_ == "NCHW":
+            assert image.shape[2] == 3
+            image = np.transpose(image, (2, 0, 1))
+        elif self.data_format_ == "NHWC":
+            assert image.shape[2] == 3
+        else:
+            raise ValueError("Unsupported image data format")
+        return image
+
+    def collate(self, batch):
+        image = np.stack([data[0] for data in batch], axis=0)
+        issame = np.array([data[1] for data in batch], dtype=np.int32)
+        return (image, issame)
diff --git a/python/oneflow/compatible/single_client/test/serving/resnet_model.py b/python/oneflow/compatible/single_client/test/serving/resnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..066ca548fb7583e9005656121578141548ff0e07
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/resnet_model.py
@@ -0,0 +1,191 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+BLOCK_COUNTS = [3, 4, 6, 3]
+BLOCK_FILTERS = [256, 512, 1024, 2048]
+BLOCK_FILTERS_INNER = [64, 128, 256, 512]
+
+
+class ResnetBuilder(object):
+    def __init__(
+        self, weight_regularizer, trainable=True, training=True, channel_last=False
+    ):
+        self.data_format = "NHWC" if channel_last else "NCHW"
+        self.weight_initializer = flow.variance_scaling_initializer(
+            2, "fan_in", "random_normal", data_format=self.data_format
+        )
+        self.weight_regularizer = weight_regularizer
+        self.trainable = trainable
+        self.training = training
+
+    def _conv2d(
+        self, name, input, filters, kernel_size, strides=1, padding="SAME", dilations=1
+    ):
+        if self.data_format == "NHWC":
+            shape = (filters, kernel_size, kernel_size, input.shape[3])
+        else:
+            shape = (filters, input.shape[1], kernel_size, kernel_size)
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=shape,
+            dtype=input.dtype,
+            initializer=self.weight_initializer,
+            regularizer=self.weight_regularizer,
+            model_name="weight",
+            trainable=self.trainable,
+        )
+        return flow.nn.conv2d(
+            input,
+            weight,
+            strides,
+            padding,
+            None,
+            self.data_format,
+            dilations,
+            name=name,
+        )
+
+    def _batch_norm(self, inputs, name=None, last=False):
+        initializer = flow.zeros_initializer() if last else flow.ones_initializer()
+        axis = 1
+        if self.data_format == "NHWC":
+            axis = 3
+        return flow.layers.batch_normalization(
+            inputs=inputs,
+            axis=axis,
+            momentum=0.9,
+            epsilon=1e-05,
+            center=True,
+            scale=True,
+            trainable=self.trainable,
+            training=self.training,
+            gamma_initializer=initializer,
+            moving_variance_initializer=initializer,
+            gamma_regularizer=self.weight_regularizer,
+            beta_regularizer=self.weight_regularizer,
+            name=name,
+        )
+
+    def conv2d_affine(
+        self, input, name, filters, kernel_size, strides, activation=None, last=False
+    ):
+        padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID"
+        output = self._conv2d(name, input, filters, kernel_size, strides, padding)
+        output = self._batch_norm(output, name + "_bn", last=last)
+        if activation == "Relu":
+            output = flow.nn.relu(output)
+        return output
+
+    def bottleneck_transformation(
+        self, input, block_name, filters, filters_inner, strides
+    ):
+        a = self.conv2d_affine(
+            input, block_name + "_branch2a", filters_inner, 1, 1, activation="Relu"
+        )
+        b = self.conv2d_affine(
+            a, block_name + "_branch2b", filters_inner, 3, strides, activation="Relu"
+        )
+        c = self.conv2d_affine(b, block_name + "_branch2c", filters, 1, 1, last=True)
+        return c
+
+    def residual_block(self, input, block_name, filters, filters_inner, strides_init):
+        if strides_init != 1 or block_name == "res2_0":
+            shortcut = self.conv2d_affine(
+                input, block_name + "_branch1", filters, 1, strides_init
+            )
+        else:
+            shortcut = input
+        bottleneck = self.bottleneck_transformation(
+            input, block_name, filters, filters_inner, strides_init
+        )
+        return flow.nn.relu(bottleneck + shortcut)
+
+    def residual_stage(
+        self, input, stage_name, counts, filters, filters_inner, stride_init=2
+    ):
+        output = input
+        for i in range(counts):
+            block_name = "%s_%d" % (stage_name, i)
+            output = self.residual_block(
+                output, block_name, filters, filters_inner, stride_init if i == 0 else 1
+            )
+        return output
+
+    def resnet_conv_x_body(self, input):
+        output = input
+        for (i, (counts, filters, filters_inner)) in enumerate(
+            zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER)
+        ):
+            stage_name = "res%d" % (i + 2)
+            output = self.residual_stage(
+                output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2
+            )
+        return output
+
+    def resnet_stem(self, input):
+        conv1 = self._conv2d("conv1", input, 64, 7, 2)
+        conv1_bn = flow.nn.relu(self._batch_norm(conv1, "conv1_bn"))
+        pool1 = flow.nn.max_pool2d(
+            conv1_bn,
+            ksize=3,
+            strides=2,
+            padding="SAME",
+            data_format=self.data_format,
+            name="pool1",
+        )
+        return pool1
+
+
+def resnet50(
+    images,
+    trainable=True,
+    need_transpose=False,
+    training=True,
+    wd=1.0 / 32768,
+    channel_last=False,
+):
+    weight_regularizer = flow.regularizers.l2(wd) if wd > 0.0 and wd < 1.0 else None
+    builder = ResnetBuilder(weight_regularizer, trainable, training, channel_last)
+    if need_transpose:
+        images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
+    if channel_last:
+        images = flow.transpose(images, name="transpose", perm=[0, 2, 3, 1])
+    with flow.scope.namespace("Resnet"):
+        stem = builder.resnet_stem(images)
+        body = builder.resnet_conv_x_body(stem)
+        pool5 = flow.nn.avg_pool2d(
+            body,
+            ksize=7,
+            strides=1,
+            padding="VALID",
+            data_format=builder.data_format,
+            name="pool5",
+        )
+        fc1001 = flow.layers.dense(
+            flow.reshape(pool5, (pool5.shape[0], -1)),
+            units=1000,
+            use_bias=True,
+            kernel_initializer=flow.variance_scaling_initializer(
+                2, "fan_in", "random_normal"
+            ),
+            bias_initializer=flow.zeros_initializer(),
+            kernel_regularizer=weight_regularizer,
+            bias_regularizer=weight_regularizer,
+            trainable=trainable,
+            name="fc1001",
+        )
+    return fc1001
diff --git a/python/oneflow/compatible/single_client/test/serving/style_model.py b/python/oneflow/compatible/single_client/test/serving/style_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..b370273eeb4e30dfaaf957e205047bc3f8801797
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/style_model.py
@@ -0,0 +1,200 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible import single_client as flow
+
+
+def instance_norm(input, name_prefix, trainable=True):
+    (mean, variance) = flow.nn.moments(input, [2, 3], keepdims=True)
+    gamma = flow.get_variable(
+        name_prefix + "_gamma",
+        shape=(1, input.shape[1], 1, 1),
+        dtype=input.dtype,
+        initializer=flow.ones_initializer(),
+        trainable=trainable,
+    )
+    beta = flow.get_variable(
+        name_prefix + "_beta",
+        shape=(1, input.shape[1], 1, 1),
+        dtype=input.dtype,
+        initializer=flow.zeros_initializer(),
+        trainable=trainable,
+    )
+    epsilon = 0.001
+    normalized = (input - mean) / flow.math.sqrt(variance + epsilon)
+    return gamma * normalized + beta
+
+
+def conv2d_layer(
+    name,
+    input,
+    out_channel,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    use_bias=True,
+    weight_initializer=flow.variance_scaling_initializer(
+        2, "fan_out", "random_normal", data_format="NCHW"
+    ),
+    bias_initializer=flow.zeros_initializer(),
+    trainable=True,
+):
+    weight_shape = (out_channel, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "_weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+        trainable=trainable,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "_bias",
+            shape=(out_channel,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+            trainable=trainable,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    return output
+
+
+def upsampleConvLayer(
+    input,
+    name_prefix,
+    channel,
+    kernel_size,
+    hw_scale=(2, 2),
+    data_format="NCHW",
+    interpolation="nearest",
+    trainable=True,
+):
+    upsample = flow.layers.upsample_2d(
+        input,
+        size=hw_scale,
+        data_format=data_format,
+        interpolation=interpolation,
+        name=name_prefix + "_%s" % interpolation,
+    )
+    return conv2d_layer(
+        name_prefix + "_conv",
+        upsample,
+        channel,
+        kernel_size=kernel_size,
+        strides=1,
+        trainable=trainable,
+    )
+
+
+def resBlock(input, channel, name_prefix, trainable=True):
+    out = conv2d_layer(
+        name_prefix + "_conv1",
+        input,
+        channel,
+        kernel_size=3,
+        strides=1,
+        trainable=trainable,
+    )
+    out = instance_norm(out, name_prefix + "_in1", trainable=trainable)
+    out = flow.nn.relu(out)
+    out = conv2d_layer(
+        name_prefix + "_conv2",
+        out,
+        channel,
+        kernel_size=3,
+        strides=1,
+        trainable=trainable,
+    )
+    out = instance_norm(out, name_prefix + "_in2", trainable=trainable)
+    return out + input
+
+
+def deconv(
+    input, out_channel, name_prefix, kernel_size=4, strides=[2, 2], trainable=True
+):
+    weight = flow.get_variable(
+        name_prefix + "_weight",
+        shape=(input.shape[1], out_channel, kernel_size, kernel_size),
+        dtype=flow.float,
+        initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
+        trainable=True,
+    )
+    return flow.nn.conv2d_transpose(
+        input,
+        weight,
+        strides=strides,
+        padding="SAME",
+        output_shape=(
+            input.shape[0],
+            out_channel,
+            input.shape[2] * strides[0],
+            input.shape[3] * strides[1],
+        ),
+    )
+
+
+def styleNet(input, trainable=True):
+    with flow.scope.namespace("style_transfer"):
+        conv1 = conv2d_layer(
+            "first_conv", input, 32, kernel_size=9, strides=1, trainable=trainable
+        )
+        in1 = instance_norm(conv1, "first_conv_in", trainable=trainable)
+        in1 = flow.nn.relu(in1)
+        conv2 = conv2d_layer(
+            "second_conv", in1, 64, kernel_size=3, strides=2, trainable=trainable
+        )
+        in2 = instance_norm(conv2, "second_conv_in", trainable=trainable)
+        in2 = flow.nn.relu(in2)
+        conv3 = conv2d_layer(
+            "third_conv", in2, 128, kernel_size=3, strides=2, trainable=trainable
+        )
+        in3 = instance_norm(conv3, "third_conv_in", trainable=trainable)
+        in3 = flow.nn.relu(in3)
+        res1 = resBlock(in3, 128, "res1", trainable=trainable)
+        res2 = resBlock(res1, 128, "res2", trainable=trainable)
+        res3 = resBlock(res2, 128, "res3", trainable=trainable)
+        res4 = resBlock(res3, 128, "res4", trainable=trainable)
+        res5 = resBlock(res4, 128, "res5", trainable=trainable)
+        upsample1 = upsampleConvLayer(res5, "upsample1", 64, 3, trainable=trainable)
+        in4 = instance_norm(upsample1, "upsample1_in", trainable=trainable)
+        in4 = flow.nn.relu(in4)
+        upsample2 = upsampleConvLayer(in4, "upsample2", 32, 3, trainable=trainable)
+        in5 = instance_norm(upsample2, "upsample2_in", trainable=trainable)
+        in5 = flow.nn.relu(in5)
+        conv1 = conv2d_layer(
+            "last_conv", in5, 3, kernel_size=9, strides=1, trainable=trainable
+        )
+        out = flow.clamp(conv1, 0, 255)
+    return out
+
+
+def gram_matrix(input):
+    b = input.shape[0]
+    ch = input.shape[1]
+    h = input.shape[2]
+    w = input.shape[3]
+    features = flow.reshape(input, [b, ch, h * w])
+    features_t = flow.transpose(features, [0, 2, 1])
+    gram = flow.matmul(features, features_t) / (ch * h * w)
+    return gram
+
+
+def mse_loss(input):
+    return flow.math.reduce_mean(flow.math.square(input))
diff --git a/python/oneflow/compatible/single_client/test/serving/test_alexnet_save_and_load.py b/python/oneflow/compatible/single_client/test/serving/test_alexnet_save_and_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..9200d8a5c743067f5c8974ae29b18656bdf300ae
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/test_alexnet_save_and_load.py
@@ -0,0 +1,194 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import shutil
+import unittest
+
+import numpy as np
+from alexnet import alexnet, load_data
+from google.protobuf import text_format as text_format
+from ofrecord_dataset import ImageNetRecordDataset
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.core.serving import saved_model_pb2 as saved_model_pb
+
+DEFAULT_BATCH_SIZE = 8
+DEFAULT_TRAIN_DATA_PATH = "/dataset/imagenet_227/train/32/"
+DEFAULT_TRAIN_DATA_PART_NUM = 32
+DEFAULT_INFER_DATA_PATH = "/dataset/imagenet_227/train/32/"
+DEFAULT_INFER_DATA_PART_NUM = 32
+DEFAULT_CHECKPOINT_DIR = "/dataset/PNGS/cnns_model_for_test/alexnet/models/of_model_bk"
+DEFAULT_IMAGE_SIZE = 227
+
+
+def init_env():
+    flow.env.init()
+    flow.config.machine_num(1)
+    flow.config.cpu_device_num(1)
+    flow.config.gpu_device_num(1)
+    flow.config.enable_debug_mode(True)
+
+
+def make_alexnet_train_func(batch_size, data_dir, data_part_num):
+    @flow.global_function(type="train")
+    def alexnet_train() -> flow.typing.Numpy:
+        (image, label) = load_data(batch_size, data_dir, data_part_num)
+        loss = alexnet(image, label)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    return alexnet_train
+
+
+def make_alexnet_infer_func(batch_size, image_size):
+    input_lbns = {}
+    output_lbns = {}
+    image_shape = (batch_size,) + tuple(image_size)
+    label_shape = (batch_size,)
+
+    @flow.global_function(type="predict")
+    def alexnet_inference(
+        image: flow.typing.Numpy.Placeholder(image_shape, dtype=flow.float32),
+        label: flow.typing.Numpy.Placeholder(label_shape, dtype=flow.int32),
+    ) -> flow.typing.Numpy:
+        input_lbns["image"] = image.logical_blob_name
+        input_lbns["label"] = label.logical_blob_name
+        image = flow.transpose(image, perm=(0, 3, 1, 2))
+        loss = alexnet(image, label, trainable=False)
+        output = loss
+        output_lbns["output"] = output.logical_blob_name
+        return output
+
+    return (alexnet_inference, input_lbns, output_lbns)
+
+
+def load_saved_model(model_meta_file_path):
+    saved_model_proto = saved_model_pb.SavedModel()
+    with open(model_meta_file_path, "rb") as f:
+        text_format.Merge(f.read(), saved_model_proto)
+    return saved_model_proto
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSaveAndLoadModel(flow.unittest.TestCase):
+    def test_alexnet(test_case, batch_size=DEFAULT_BATCH_SIZE, num_batchs=6):
+        init_env()
+        (alexnet_infer, input_lbns, output_lbns) = make_alexnet_infer_func(
+            batch_size, (DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, 3)
+        )
+        flow.load_variables(flow.checkpoint.get(DEFAULT_CHECKPOINT_DIR))
+        saved_model_path = "alexnet_models"
+        model_name = "alexnet"
+        model_version = 1
+        model_version_path = os.path.join(saved_model_path, str(model_version))
+        if os.path.exists(saved_model_path) and os.path.isdir(saved_model_path):
+            print(
+                "WARNING: The model version path '{}' already exist, old version directory will be removed".format(
+                    model_version_path
+                )
+            )
+            shutil.rmtree(saved_model_path)
+        saved_model_builder = flow.saved_model.ModelBuilder(saved_model_path)
+        signature_builder = (
+            saved_model_builder.ModelName(model_name)
+            .Version(model_version)
+            .AddFunction(alexnet_infer)
+            .AddSignature("regress")
+        )
+        for (input_name, lbn) in input_lbns.items():
+            signature_builder.Input(input_name, lbn)
+        for (output_name, lbn) in output_lbns.items():
+            signature_builder.Output(output_name, lbn)
+        saved_model_builder.Save()
+        new_batch_size = int(batch_size / 2)
+        dataset = ImageNetRecordDataset(
+            batch_size=new_batch_size,
+            image_resize_size=DEFAULT_IMAGE_SIZE,
+            data_format="NHWC",
+        )
+        (image_list, label_list) = dataset.load_batchs(num_batchs)
+        assert image_list[0].shape[0] == new_batch_size
+        image_size = tuple(image_list[0].shape[1:])
+        flow.clear_default_session()
+        (alexnet_infer, _, _) = make_alexnet_infer_func(new_batch_size, image_size)
+        flow.load_variables(flow.checkpoint.get(DEFAULT_CHECKPOINT_DIR))
+        print("alexnet inference result:")
+        origin_outputs = []
+        for (i, (image, label)) in enumerate(zip(image_list, label_list)):
+            output = alexnet_infer(image, label)
+            origin_outputs.append(output)
+            print("iter#{:<6} output:".format(i), output)
+        origin_outputs = np.array(origin_outputs, dtype=np.float32)
+        flow.clear_default_session()
+        model_meta_file_path = os.path.join(
+            saved_model_path, str(model_version), "saved_model.prototxt"
+        )
+        saved_model_proto = load_saved_model(model_meta_file_path)
+        sess = flow.serving.InferenceSession()
+        checkpoint_path = os.path.join(
+            saved_model_path, str(model_version), saved_model_proto.checkpoint_dir
+        )
+        sess.set_checkpoint_path(checkpoint_path)
+        graph_name = saved_model_proto.default_graph_name
+        graph_def = saved_model_proto.graphs[graph_name]
+        signature_def = graph_def.signatures[graph_def.default_signature_name]
+        with sess.open(graph_name, signature_def, new_batch_size):
+            sess.compile(graph_def.op_list)
+        sess.launch()
+        job_name = sess.list_jobs()[0]
+        input_names = sess.list_inputs()
+        print("input names:", input_names)
+        for input_name in input_names:
+            print(
+                'input "{}" info: {}'.format(
+                    input_name, sess.input_info(input_name, job_name)
+                )
+            )
+        output_names = sess.list_outputs()
+        print("output names:", output_names)
+        for output_name in output_names:
+            print(
+                'output "{}" info: {}'.format(
+                    output_name, sess.output_info(output_name, job_name)
+                )
+            )
+        print("load saved alexnet and inference result:")
+        print_input_info = False
+        cmp_outputs = []
+        for (i, (image, label)) in enumerate(zip(image_list, label_list)):
+            if print_input_info:
+                print("image shape: {}, dtype: {}".format(image.shape, image.dtype))
+                print(
+                    "label shape: {}, dtype: {}, data: {}".format(
+                        label.shape, label.dtype, label
+                    )
+                )
+                if i > 1:
+                    print((image - image_list[i - 1]).mean())
+            outputs = sess.run(alexnet_infer.__name__, image=image, label=label)
+            cmp_outputs.append(outputs[0])
+            print("iter#{:<6} output:".format(i), outputs[0])
+        cmp_outputs = np.array(cmp_outputs, dtype=np.float32)
+        test_case.assertTrue(np.allclose(origin_outputs, cmp_outputs))
+        sess.close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/serving/test_insightface_save_and_load.py b/python/oneflow/compatible/single_client/test/serving/test_insightface_save_and_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..09647c2a13d27906ce7af5600876b10ed8765536
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/test_insightface_save_and_load.py
@@ -0,0 +1,147 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import argparse
+import os
+import shutil
+import sys
+import unittest
+
+import numpy as np
+from insightface_resnet100 import Resnet100
+from ofrecord_dataset import FaceEmoreRecordDataset
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def init_env():
+    flow.env.init()
+    flow.config.machine_num(1)
+    flow.config.cpu_device_num(1)
+    flow.config.gpu_device_num(1)
+    flow.config.enable_debug_mode(True)
+
+
+def get_predict_config(device_type="gpu", device_num=1, default_data_type=flow.float32):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(default_data_type)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    return func_config
+
+
+def make_insightface_resnet100_func(
+    batch_size=1, image_height=112, image_width=112, channels=3
+):
+    shape = (batch_size, channels, image_height, image_width)
+
+    @flow.global_function(type="predict", function_config=get_predict_config())
+    def insightface_resnet100_func(
+        image: flow.typing.Numpy.Placeholder(shape),
+    ) -> flow.typing.Numpy:
+        embedding = Resnet100(image, embedding_size=512, fc_type="FC")
+        return embedding
+
+    return insightface_resnet100_func
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSaveAndLoadModel(flow.unittest.TestCase):
+    DATA_DIR = "/dataset/insightface/eval_ofrecord/lfw"
+    NUM_DATA_PARTS = 1
+    MODEL_DIR = "/dataset/model_zoo/insightface/emore_r100_arcface"
+    BATCH_SIZE = 1
+    IMAGE_SIZE = 112
+    NUM_ITER = 6
+
+    def test_insightface(self):
+        init_env()
+        print("Get data from FaceEmoreRecordDataset")
+        dataset = FaceEmoreRecordDataset(
+            data_dir=self.DATA_DIR,
+            num_data_parts=self.NUM_DATA_PARTS,
+            batch_size=self.BATCH_SIZE,
+            image_width=self.IMAGE_SIZE,
+            image_height=self.IMAGE_SIZE,
+            data_format="NCHW",
+        )
+        (image_list, issame_list) = dataset.load_batchs(self.NUM_ITER)
+        print("Define inference function for insightface")
+        infer_fn = make_insightface_resnet100_func(
+            self.BATCH_SIZE, self.IMAGE_SIZE, self.IMAGE_SIZE
+        )
+        print("Load variables for insightface model")
+        flow.load_variables(flow.checkpoint.get(self.MODEL_DIR))
+        print("Call inference function directly")
+        features = []
+        for (i, image) in enumerate(image_list):
+            feature = infer_fn(image)
+            features.append(feature)
+        print("Save model for insightface")
+        saved_model_path = "insightface_models"
+        model_version = 1
+        model_version_path = os.path.join(saved_model_path, str(model_version))
+        if os.path.exists(model_version_path) and os.path.isdir(model_version_path):
+            print(
+                "WARNING: The model version path '{}' already exist, old version directory will be removed".format(
+                    model_version_path
+                )
+            )
+            shutil.rmtree(model_version_path)
+        saved_model_builder = (
+            flow.saved_model.ModelBuilder(saved_model_path)
+            .ModelName("insightface")
+            .Version(model_version)
+        )
+        saved_model_builder.AddFunction(infer_fn).Finish()
+        saved_model_builder.Save()
+        flow.clear_default_session()
+        print("InferenceSession load model")
+        flow.clear_default_session()
+        sess = flow.serving.InferenceSession()
+        sess.load_saved_model(saved_model_path)
+        sess.launch()
+        job_name = sess.list_jobs()[0]
+        input_names = sess.list_inputs()
+        print("input names:", input_names)
+        for input_name in input_names:
+            print(
+                'input "{}" info: {}'.format(
+                    input_name, sess.input_info(input_name, job_name)
+                )
+            )
+        print("Run model and compare ")
+        for (i, (image, feature)) in enumerate(zip(image_list, features)):
+            input_dict = {input_names[0]: image}
+            infer_result = sess.run(job_name, **input_dict)
+            self.assertTrue(np.allclose(infer_result, feature))
+        sess.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-dir")
+    parser.add_argument("--model-dir")
+    (args, unknown) = parser.parse_known_args()
+    if args.data_dir is not None:
+        TestSaveAndLoadModel.DATA_DIR = args.data_dir
+    if args.model_dir is not None:
+        TestSaveAndLoadModel.MODEL_DIR = args.model_dir
+    argv = sys.argv[0:1] + unknown
+    unittest.main(argv=argv)
diff --git a/python/oneflow/compatible/single_client/test/serving/test_resnet_save_and_load.py b/python/oneflow/compatible/single_client/test/serving/test_resnet_save_and_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6d5cbc2cf033404a5e66dde1c64185931c3189b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/test_resnet_save_and_load.py
@@ -0,0 +1,139 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import shutil
+import unittest
+
+import numpy as np
+from google.protobuf import text_format as text_format
+from ofrecord_dataset import ImageNetRecordDataset
+from resnet_model import resnet50
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+from oneflow.core.serving import saved_model_pb2 as saved_model_pb
+
+DEFAULT_BATCH_SIZE = 4
+DEFAULT_CHECKPOINT_DIR = "/dataset/model_zoo/resnet_v15_of_best_model_val_top1_77318"
+DEFAULT_IMAGE_SIZE = 224
+
+
+def init_env():
+    flow.env.init()
+    flow.config.machine_num(1)
+    flow.config.cpu_device_num(1)
+    flow.config.gpu_device_num(1)
+    flow.config.enable_debug_mode(True)
+
+
+def make_resnet_infer_func(batch_size, image_size):
+    input_lbns = {}
+    output_lbns = {}
+    image_shape = (batch_size,) + tuple(image_size)
+
+    @flow.global_function(type="predict")
+    def resnet_inference(
+        image: flow.typing.Numpy.Placeholder(image_shape, dtype=flow.float32)
+    ) -> flow.typing.Numpy:
+        input_lbns["image"] = image.logical_blob_name
+        output = resnet50(image, trainable=False)
+        output = flow.nn.softmax(output)
+        output_lbns["output"] = output.logical_blob_name
+        return output
+
+    return (resnet_inference, input_lbns, output_lbns)
+
+
+def load_saved_model(model_meta_file_path):
+    saved_model_proto = saved_model_pb.SavedModel()
+    with open(model_meta_file_path, "rb") as f:
+        text_format.Merge(f.read(), saved_model_proto)
+    return saved_model_proto
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSaveAndLoadModel(flow.unittest.TestCase):
+    def test_resnet(test_case, batch_size=DEFAULT_BATCH_SIZE, num_batchs=6):
+        init_env()
+        image_size = (3, DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE)
+        (resnet_infer, input_lbns, output_lbns) = make_resnet_infer_func(
+            batch_size, image_size
+        )
+        flow.load_variables(flow.checkpoint.get(DEFAULT_CHECKPOINT_DIR))
+        dataset = ImageNetRecordDataset(
+            batch_size=batch_size,
+            image_resize_size=DEFAULT_IMAGE_SIZE,
+            data_format="NCHW",
+        )
+        (image_list, label_list) = dataset.load_batchs(num_batchs)
+        print("resnet inference result:")
+        origin_outputs = []
+        for (i, (image, label)) in enumerate(zip(image_list, label_list)):
+            output = resnet_infer(image)
+            arg_max = np.argmax(output, axis=1)
+            origin_outputs.append(arg_max)
+            print("iter#{:<6} predict: ".format(i), arg_max, "label: ", label)
+        origin_outputs = np.array(origin_outputs, dtype=np.float32)
+        saved_model_path = "resnet50_models"
+        model_version = 1
+        model_version_path = os.path.join(saved_model_path, str(model_version))
+        if os.path.exists(model_version_path) and os.path.isdir(model_version_path):
+            print(
+                "WARNING: The model version path '{}' already exist, old version directory will be removed".format(
+                    model_version_path
+                )
+            )
+            shutil.rmtree(model_version_path)
+        saved_model_builder = flow.saved_model.ModelBuilder(saved_model_path)
+        signature_builder = (
+            saved_model_builder.ModelName("resnet50")
+            .Version(model_version)
+            .AddFunction(resnet_infer)
+            .AddSignature("regress")
+        )
+        for (input_name, lbn) in input_lbns.items():
+            signature_builder.Input(input_name, lbn)
+        for (output_name, lbn) in output_lbns.items():
+            signature_builder.Output(output_name, lbn)
+        saved_model_builder.Save()
+        flow.clear_default_session()
+        sess = flow.serving.InferenceSession()
+        sess.load_saved_model(saved_model_path)
+        sess.launch()
+        job_name = sess.list_jobs()[0]
+        input_names = sess.list_inputs()
+        print("input names:", input_names)
+        for input_name in input_names:
+            print(
+                'input "{}" info: {}'.format(
+                    input_name, sess.input_info(input_name, job_name)
+                )
+            )
+        print("load saved resnet and inference result:")
+        cmp_outputs = []
+        for (i, (image, label)) in enumerate(zip(image_list, label_list)):
+            outputs = sess.run(resnet_infer.__name__, image=image)
+            arg_max = np.argmax(outputs[0], axis=1)
+            cmp_outputs.append(arg_max)
+            print("iter#{:<6} output:".format(i), arg_max, "label: ", label)
+        cmp_outputs = np.array(cmp_outputs, dtype=np.float32)
+        test_case.assertTrue(np.allclose(origin_outputs, cmp_outputs))
+        sess.close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/serving/test_style_transfer_save_and_load.py b/python/oneflow/compatible/single_client/test/serving/test_style_transfer_save_and_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea3d312bd9b96753205df5bc55da34cc267bfd07
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/serving/test_style_transfer_save_and_load.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import argparse
+import os
+import sys
+import unittest
+
+import cv2
+import numpy as np
+import style_model
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+
+def init_env():
+    flow.config.machine_num(1)
+    flow.config.cpu_device_num(1)
+    flow.config.gpu_device_num(1)
+    flow.config.enable_debug_mode(True)
+
+
+def get_predict_config(device_type="gpu", device_num=1, default_data_type=flow.float32):
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(default_data_type)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_placement_scope(
+        flow.scope.placement(device_type, "0:0-{}".format(device_num - 1))
+    )
+    return func_config
+
+
+def make_style_transfer(image_height, image_width, channels=3):
+    @flow.global_function("predict", get_predict_config())
+    def style_model_predict(
+        image: flow.typing.Numpy.Placeholder(
+            (1, channels, image_height, image_width), dtype=flow.float32
+        )
+    ) -> flow.typing.Numpy:
+        style_out = style_model.styleNet(image, trainable=True)
+        return style_out
+
+    return style_model_predict
+
+
+def load_image(image_file):
+    im = cv2.imread(image_file)
+    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+    im = np.transpose(im, (2, 0, 1))
+    im = np.expand_dims(im, axis=0)
+    return np.ascontiguousarray(im, "float32")
+
+
+def recover_image(im):
+    im = np.squeeze(im)
+    im = np.transpose(im, (1, 2, 0))
+    im = cv2.cvtColor(np.float32(im), cv2.COLOR_RGB2BGR)
+    return im.astype(np.uint8)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSaveAndLoadModel(flow.unittest.TestCase):
+    INPUT_IMAGE_FILE = (
+        "/dataset/model_zoo/fast_neural_style/images/content-images/amber.jpg"
+    )
+    OUTPUT_IMAGE_FILE = None
+    CHECKPOINT_DIR = "/dataset/model_zoo/fast_neural_style/sketch_lr_0.001000_cw_10000.000000_sw_10000000000.000000_epoch_0_iter_4400_loss_3008.877197"
+
+    def test_style_model(self):
+        init_env()
+        input_image = load_image(self.INPUT_IMAGE_FILE)
+        (image_height, image_width) = input_image.shape[2:]
+        style_transfer = make_style_transfer(image_height, image_width)
+        flow.load_variables(flow.checkpoint.get(self.CHECKPOINT_DIR))
+        saved_model_path = "style_models"
+        model_version = 1
+        saved_model_version_dir = os.path.join(saved_model_path, str(model_version))
+        if not os.path.exists(saved_model_version_dir):
+            saved_model_builder = (
+                flow.saved_model.ModelBuilder(saved_model_path)
+                .ModelName("style_transfer")
+                .Version(model_version)
+            )
+            saved_model_builder.AddFunction(style_transfer).Finish()
+            saved_model_builder.Save()
+        flow.clear_default_session()
+        sess = flow.serving.InferenceSession()
+        sess.load_saved_model(saved_model_path)
+        sess.launch()
+        job_names = sess.list_jobs()
+        print("job names:", job_names)
+        input_names = sess.list_inputs()
+        print("input names:", input_names)
+        for input_name in input_names:
+            print(
+                'input "{}" info: {}'.format(
+                    input_name, sess.input_info(input_name, job_names[0])
+                )
+            )
+        output_names = sess.list_outputs()
+        print("output names:", output_names)
+        for output_name in output_names:
+            print(
+                'input "{}" info: {}'.format(
+                    output_name, sess.output_info(output_name, job_names[0])
+                )
+            )
+        input_dict = {input_names[0]: input_image}
+        outputs = sess.run(style_transfer.__name__, **input_dict)
+        if self.OUTPUT_IMAGE_FILE is not None:
+            cv2.imwrite(self.OUTPUT_IMAGE_FILE, recover_image(outputs[0]))
+            print("write styled output image to", self.OUTPUT_IMAGE_FILE)
+        sess.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-image-file")
+    parser.add_argument("--output-image-file")
+    parser.add_argument("--model-dir")
+    (args, unknown) = parser.parse_known_args()
+    if args.input_image_file is not None:
+        TestSaveAndLoadModel.INPUT_IMAGE_FILE = args.input_image_file
+    if args.output_image_file is not None:
+        TestSaveAndLoadModel.OUTPUT_IMAGE_FILE = args.output_image_file
+    if args.model_dir is not None:
+        TestSaveAndLoadModel.CHECKPOINT_DIR = args.model_dir
+    argv = sys.argv[0:1] + unknown
+    unittest.main(argv=argv)
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_add.py b/python/oneflow/compatible/single_client/test/xrt/test_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfda38284d17b56ca4fa103e9a9cd5d0f5b2d80d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_add.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(x_shape, y_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def add_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        y=flow.FixedTensorDef(y_shape, dtype=dtype),
+    ):
+        return x + y + x
+
+    return add_job
+
+
+def make_xla_job(x_shape, y_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_add_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        y=flow.FixedTensorDef(y_shape, dtype=dtype),
+    ):
+        return x + y + x
+
+    return xla_add_job
+
+
+def make_trt_job(x_shape, y_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_add_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        y=flow.FixedTensorDef(y_shape, dtype=dtype),
+    ):
+        return x + y + x
+
+    return trt_add_job
+
+
+class TestAdd(unittest.TestCase):
+    def _test_body(self, x, y, dtype=np.float32):
+        f1 = make_job(x.shape, y.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, y.shape, dtype=flow.float32)
+        f3 = make_trt_job(x.shape, y.shape, dtype=flow.float32)
+        a = f1(x, y).get()
+        b = f2(x, y).get()
+        c = f3(x, y).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        print("with tensorrt", c)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, y_shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        y = np.ones(y_shape, dtype=dtype)
+        self._test_body(x, y, dtype=dtype)
+
+    def _test_random_body(self, x_shape, y_shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        y = np.random.random(y_shape).astype(dtype)
+        self._test_body(x, y, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10), (1, 10))
+        self._test_ones_body((2, 10, 2), (2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2), (2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10), (1, 10))
+        self._test_random_body((2, 10, 2), (2, 10, 2))
+        self._test_random_body((2, 5, 2, 2), (2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_batch_norm.py b/python/oneflow/compatible/single_client/test/xrt/test_batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfd73be41d2c4abac289a66b3795e011bb3bc2af
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_batch_norm.py
@@ -0,0 +1,97 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def batch_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.layers.batch_normalization(x, axis=axis)
+
+    return batch_norm_job
+
+
+def make_xla_job(input_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_batch_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.layers.batch_normalization(x, axis=axis)
+
+    return xla_batch_norm_job
+
+
+def make_trt_job(input_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_batch_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.layers.batch_normalization(x, axis=axis)
+
+    return trt_batch_norm_job
+
+
+class TestRelu(unittest.TestCase):
+    def _test_body(self, x, axis, dtype=np.float32):
+        f1 = make_job(x.shape, axis, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, axis, dtype=flow.float32)
+        f3 = make_trt_job(x.shape, axis, dtype=flow.float32)
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, axis, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, axis, dtype=dtype)
+
+    def _test_random_body(self, shape, axis, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, axis, dtype=dtype)
+
+    "\n      TensorRT batch norm only support 4-d tensor (NCHW).\n    "
+
+    def test_ones_input(self):
+        self._test_ones_body((2, 1, 2, 2), 1)
+        self._test_ones_body((2, 5, 2, 2), 1)
+
+    def test_random_input(self):
+        self._test_random_body((2, 1, 2, 2), 1)
+        self._test_random_body((2, 5, 2, 2), 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_batch_norm_add.py b/python/oneflow/compatible/single_client/test/xrt/test_batch_norm_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..1986caf59303b11ebcb5999b503207cb1591c615
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_batch_norm_add.py
@@ -0,0 +1,131 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, axis, fuse_add_to_output, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+    config.enable_fuse_add_to_output(fuse_add_to_output)
+
+    @flow.global_function(config)
+    def batch_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        out = flow.layers.batch_normalization(x, axis=axis)
+        c = flow.get_variable(
+            "c",
+            shape=out.shape,
+            dtype=flow.float,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        out = flow.math.add_n([out, c])
+        return out
+
+    return batch_norm_job
+
+
+def make_xla_job(input_shape, axis, fuse_add_to_output, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+    config.enable_fuse_add_to_output(fuse_add_to_output)
+
+    @flow.global_function(config)
+    def xla_batch_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        out = flow.layers.batch_normalization(x, axis=axis)
+        c = flow.get_variable(
+            "c",
+            shape=out.shape,
+            dtype=flow.float,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        out = flow.math.add_n([out, c])
+        return out
+
+    return xla_batch_norm_job
+
+
+def make_trt_job(input_shape, axis, fuse_add_to_output, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+    config.enable_fuse_add_to_output(fuse_add_to_output)
+
+    @flow.global_function(config)
+    def trt_batch_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        out = flow.layers.batch_normalization(x, axis=axis)
+        c = flow.get_variable(
+            "c",
+            shape=out.shape,
+            dtype=flow.float,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        out = flow.math.add_n([out, c])
+        return out
+
+    return trt_batch_norm_job
+
+
+class TestRelu(unittest.TestCase):
+    def _test_body(self, x, axis, fuse_add_to_output, dtype=np.float32):
+        f1 = make_job(x.shape, axis, fuse_add_to_output, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, axis, fuse_add_to_output, dtype=flow.float32)
+        f3 = make_trt_job(x.shape, axis, fuse_add_to_output, dtype=flow.float32)
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a.numpy())
+        print("with xla: ", b.numpy())
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        c = f3(x).get()
+        print("with tensorrt: ", c.numpy())
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, axis, fuse_add_to_output, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, axis, fuse_add_to_output, dtype=dtype)
+
+    def _test_random_body(self, shape, axis, fuse_add_to_output, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, axis, fuse_add_to_output, dtype=dtype)
+
+    "\n      TensorRT batch norm only support 4-d tensor (NCHW).\n    "
+
+    def test_ones_input(self):
+        self._test_ones_body((2, 1, 2, 2), 1, True)
+        self._test_ones_body((2, 1, 2, 2), 1, False)
+        self._test_ones_body((2, 5, 2, 2), 1, True)
+        self._test_ones_body((2, 5, 2, 2), 1, False)
+
+    def test_random_input(self):
+        self._test_random_body((2, 1, 2, 2), 1, True)
+        self._test_random_body((2, 1, 2, 2), 1, False)
+        self._test_random_body((2, 5, 2, 2), 1, True)
+        self._test_random_body((2, 5, 2, 2), 1, False)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_bias_add.py b/python/oneflow/compatible/single_client/test/xrt/test_bias_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2eb37e99c966ded8ec84e49fc01e0bea1b8fc97
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_bias_add.py
@@ -0,0 +1,107 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(x_shape, b_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def bias_add_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        bias=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.nn.bias_add(x, bias)
+
+    return bias_add_job
+
+
+def make_xla_job(x_shape, b_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_bias_add_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        bias=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.nn.bias_add(x, bias)
+
+    return xla_bias_add_job
+
+
+def make_trt_job(x_shape, b_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_bias_add_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        bias=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.nn.bias_add(x, bias)
+
+    return trt_bias_add_job
+
+
+class TestBiasAdd(unittest.TestCase):
+    def _test_body(self, x, bias, dtype=np.float32):
+        f1 = make_job(x.shape, bias.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, bias.shape, dtype=flow.float32)
+        a = f1(x, bias).get()
+        b = f2(x, bias).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, bias.shape, dtype=flow.float32)
+        c = f3(x, bias).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, bias_shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        b = np.ones(bias_shape, dtype=dtype)
+        self._test_body(x, b, dtype=dtype)
+
+    def _test_random_body(self, x_shape, bias_shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        b = np.random.random(bias_shape).astype(dtype)
+        self._test_body(x, b, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10), 10)
+        self._test_ones_body((2, 10, 2), 10)
+        self._test_ones_body((2, 5, 2, 2), 5)
+
+    def test_random_input(self):
+        self._test_random_body((1, 10), 10)
+        self._test_random_body((2, 10, 2), 10)
+        self._test_random_body((2, 5, 2, 2), 5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_broadcast_op.py b/python/oneflow/compatible/single_client/test/xrt/test_broadcast_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd5c662d03df407ee291407a3ee6d1c8750f67f0
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_broadcast_op.py
@@ -0,0 +1,184 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+class TestBroadcastOp(unittest.TestCase):
+    run_test = False
+
+    def _test_body(self, x, y, dtype=np.float32):
+        if not self.run_test:
+            return
+        f1 = self.make_job(x.shape, y.shape, dtype=flow.float32)
+        f2 = self.make_xla_job(x.shape, y.shape, dtype=flow.float32)
+        a = f1(x, y).get()
+        b = f2(x, y).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, y_shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        y = np.ones(y_shape, dtype=dtype)
+        self._test_body(x, y, dtype=dtype)
+
+    def _test_random_body(self, x_shape, y_shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        y = np.random.random(y_shape).astype(dtype)
+        self._test_body(x, y, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10), (1, 1))
+        self._test_ones_body((2, 10, 2), (2, 1, 2))
+        self._test_ones_body((2, 5, 2, 2), (1, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10), (1, 1))
+        self._test_random_body((2, 10, 2), (2, 1, 2))
+        self._test_random_body((2, 5, 2, 2), (1, 5, 2, 2))
+
+
+class TestBroadcastAddOp(TestBroadcastOp):
+    run_test = True
+
+    def make_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def broadcast_add_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.add(x, y)
+
+        return broadcast_add_job
+
+    def make_xla_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_broadcast_add_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.add(x, y)
+
+        return xla_broadcast_add_job
+
+
+class TestBroadcastMulOp(TestBroadcastOp):
+    run_test = True
+
+    def make_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def broadcast_mul_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.multiply(x, y)
+
+        return broadcast_mul_job
+
+    def make_xla_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_broadcast_mul_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.multiply(x, y)
+
+        return xla_broadcast_mul_job
+
+
+class TestBroadcastDivOp(TestBroadcastOp):
+    run_test = True
+
+    def make_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def broadcast_div_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.divide(x, y)
+
+        return broadcast_div_job
+
+    def make_xla_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_broadcast_div_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.divide(x, y)
+
+        return xla_broadcast_div_job
+
+
+class TestBroadcastMinOp(TestBroadcastOp):
+    run_test = True
+
+    def make_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def broadcast_min_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.minimum(x, y)
+
+        return broadcast_min_job
+
+    def make_xla_job(self, x_shape, y_shape, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_broadcast_min_job(
+            x=flow.FixedTensorDef(x_shape, dtype=dtype),
+            y=flow.FixedTensorDef(y_shape, dtype=dtype),
+        ):
+            return flow.math.minimum(x, y)
+
+        return xla_broadcast_min_job
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_cast.py b/python/oneflow/compatible/single_client/test/xrt/test_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d79b7f79c233fad9f58f2db08c13bb809bd68f65
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_cast.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, dtype=flow.float32, target_dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def cast_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.cast(x, dtype=target_dtype)
+
+    return cast_job
+
+
+def make_xla_job(input_shape, dtype=flow.float32, target_dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_cast_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.cast(x, dtype=target_dtype)
+
+    return xla_cast_job
+
+
+class TestCast(unittest.TestCase):
+    def _test_body(self, x, dtype=flow.float32, target_dtype=flow.float32):
+        f1 = make_job(x.shape, dtype=dtype, target_dtype=target_dtype)
+        f2 = make_xla_job(x.shape, dtype=dtype, target_dtype=target_dtype)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, dtype=flow.float32, target_dtype=flow.float32):
+        np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+        x = np.ones(shape, dtype=np_dtype)
+        self._test_body(x, dtype=dtype, target_dtype=target_dtype)
+
+    def _test_random_body(self, shape, dtype=flow.float32, target_dtype=flow.float32):
+        np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+        x = (1000 * np.random.random(shape)).astype(np_dtype)
+        self._test_body(x, dtype=dtype, target_dtype=target_dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1, flow.float32, flow.int32)
+        self._test_ones_body((1, 10), flow.int32, flow.float32)
+
+    def test_random_input(self):
+        self._test_random_body(1, flow.float32, flow.int32)
+        self._test_random_body((1, 10), flow.int32, flow.float32)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_concat.py b/python/oneflow/compatible/single_client/test/xrt/test_concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab010b96c847667d16a5930da3aaf4f656cb931a
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_concat.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(a_shape, b_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def concat_job(
+        x=flow.FixedTensorDef(a_shape, dtype=dtype),
+        y=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.concat([x, y], axis=axis)
+
+    return concat_job
+
+
+def make_trt_job(a_shape, b_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_concat_job(
+        x=flow.FixedTensorDef(a_shape, dtype=dtype),
+        y=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.concat([x, y], axis=axis)
+
+    return trt_concat_job
+
+
+class Testconcat(unittest.TestCase):
+    def _test_body(self, x, y, axis, dtype=np.float32):
+        f1 = make_job(x.shape, y.shape, axis, dtype=flow.float32)
+        f2 = make_trt_job(x.shape, y.shape, axis, dtype=flow.float32)
+        a = f1(x, y).get()
+        b = f2(x, y).get()
+        print("without xla: ", a)
+        print("with tensorrt: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, a_shape, b_shape, axis, dtype=np.float32):
+        x = np.ones(a_shape, dtype=dtype)
+        y = np.ones(b_shape, dtype=dtype)
+        self._test_body(x, y, axis, dtype=dtype)
+
+    def _test_random_body(self, a_shape, b_shape, axis, dtype=np.float32):
+        x = np.random.random(a_shape).astype(dtype)
+        y = np.random.random(b_shape).astype(dtype)
+        self._test_body(x, y, axis, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((5, 2), (5, 3), axis=1)
+        self._test_ones_body((5, 2), (5, 3), axis=-1)
+        self._test_ones_body((5, 1, 2), (5, 1, 2), axis=1)
+        self._test_ones_body((5, 1, 2), (5, 1, 2), axis=2)
+
+    def test_random_input(self):
+        self._test_random_body((5, 2), (5, 3), axis=1)
+        self._test_random_body((5, 2), (5, 3), axis=-1)
+        self._test_random_body((5, 1, 2), (5, 1, 2), axis=1)
+        self._test_random_body((5, 3, 2), (5, 3, 2), axis=2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_conv2d.py b/python/oneflow/compatible/single_client/test/xrt/test_conv2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..28fcf70d80f297acea300d8e1402e7cd3347eb3c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_conv2d.py
@@ -0,0 +1,394 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(
+    x_shape,
+    w_shape,
+    kernel_size=None,
+    strides=None,
+    padding="valid",
+    data_format="NCHW",
+    dilation_rate=None,
+    dtype=flow.float32,
+):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def conv2d_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        weight=flow.FixedTensorDef(w_shape, dtype=dtype),
+    ):
+        return flow.nn.conv2d(
+            x, weight, strides, padding, None, data_format, dilation_rate
+        )
+
+    return conv2d_job
+
+
+def make_trt_job(
+    x_shape,
+    w_shape,
+    kernel_size=None,
+    strides=None,
+    padding="valid",
+    data_format="NCHW",
+    dilation_rate=None,
+    dtype=flow.float32,
+):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_conv2d_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        weight=flow.FixedTensorDef(w_shape, dtype=dtype),
+    ):
+        return flow.nn.conv2d(
+            x, weight, strides, padding, None, data_format, dilation_rate
+        )
+
+    return trt_conv2d_job
+
+
+class TestConv2d(unittest.TestCase):
+    def make_filter_shape(self, shape, filters, kernel_size, data_format):
+        if data_format == "NCHW":
+            return [filters, shape[1], kernel_size, kernel_size]
+        else:
+            return [filters, kernel_size, kernel_size, shape[3]]
+
+    def _test_body(
+        self,
+        x,
+        filters,
+        kernel_size,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        dtype=np.float32,
+    ):
+        f1 = make_job(
+            x.shape,
+            filters.shape,
+            kernel_size,
+            strides,
+            padding,
+            data_format,
+            dilation_rate,
+            dtype=flow.float32,
+        )
+        f2 = make_trt_job(
+            x.shape,
+            filters.shape,
+            kernel_size,
+            strides,
+            padding,
+            data_format,
+            dilation_rate,
+            dtype=flow.float32,
+        )
+        a = f1(x, filters).get()
+        b = f2(x, filters).get()
+        print("without xla: ", a)
+        print("with tensorrt: ", b)
+        self.assertTrue(a.shape == b.shape)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(
+        self,
+        shape,
+        filters,
+        kernel_size,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        dtype=np.float32,
+    ):
+        assert len(shape) == 4
+        x = np.ones(shape, dtype=dtype)
+        w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
+        weight = np.random.random(w_shape).astype(dtype)
+        self._test_body(
+            x,
+            weight,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            dilation_rate=dilation_rate,
+        )
+
+    def _test_random_body(
+        self,
+        shape,
+        filters,
+        kernel_size,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        dtype=np.float32,
+    ):
+        assert len(shape) == 4
+        x = np.random.random(shape).astype(dtype)
+        w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
+        weight = np.random.random(w_shape).astype(dtype)
+        self._test_body(
+            x,
+            weight,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            dilation_rate=dilation_rate,
+        )
+
+    def test_ones_kernel_1x1(self):
+        self._test_ones_body(
+            shape=[1, 1, 1, 1],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[1, 3, 1, 1],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[1, 1, 5, 5],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[3, 3, 5, 5],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+
+    def test_random_kernel_1x1(self):
+        self._test_random_body(
+            shape=[1, 1, 1, 1],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 3, 1, 1],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 1, 5, 5],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[3, 3, 5, 5],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+
+    def test_ones_kernel_3x3(self):
+        self._test_ones_body(
+            shape=[1, 1, 3, 3],
+            filters=1,
+            kernel_size=3,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[1, 3, 5, 5],
+            filters=1,
+            kernel_size=3,
+            strides=1,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[1, 5, 3, 3],
+            filters=1,
+            kernel_size=3,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+
+    def test_random_kernel_3x3(self):
+        self._test_random_body(
+            shape=[1, 1, 3, 3],
+            filters=1,
+            kernel_size=3,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 3, 3, 3],
+            filters=1,
+            kernel_size=3,
+            strides=1,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 3, 3, 3],
+            filters=1,
+            kernel_size=3,
+            strides=1,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 3, 3, 3],
+            filters=1,
+            kernel_size=3,
+            strides=1,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+
+    def test_ones_kernel_11x11(self):
+        self._test_ones_body(
+            shape=[1, 3, 24, 24],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[1, 3, 24, 24],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[1, 3, 27, 27],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_ones_body(
+            shape=[1, 3, 27, 27],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+
+    def test_random_kernel_11x11(self):
+        self._test_random_body(
+            shape=[1, 3, 24, 24],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 3, 24, 24],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 3, 27, 27],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+        self._test_random_body(
+            shape=[1, 3, 27, 27],
+            filters=3,
+            kernel_size=11,
+            strides=4,
+            padding="SAME",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_gather.py b/python/oneflow/compatible/single_client/test/xrt/test_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e0745220d3fa237a8cefd3e2dfa458dc0a04ab3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_gather.py
@@ -0,0 +1,131 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+class TestGather(unittest.TestCase):
+    def _test_body(self, x, indices, axis, dtype=flow.float32):
+        indices = np.array(indices).astype(np.int32)
+        f1 = self.make_job(x.shape, indices.shape, axis, dtype=dtype)
+        f2 = self.make_xla_job(x.shape, indices.shape, axis, dtype=dtype)
+        a = f1(x, indices).get()
+        b = f2(x, indices).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def make_job(self, input_shape, indices_shape, axis, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def gather_job(
+            x=flow.FixedTensorDef(input_shape, dtype=dtype),
+            indices=flow.FixedTensorDef(indices_shape, dtype=flow.int32),
+        ):
+            return flow.gather(x, indices, axis=axis)
+
+        return gather_job
+
+    def make_xla_job(self, input_shape, indices_shape, axis, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_gather_job(
+            x=flow.FixedTensorDef(input_shape, dtype=dtype),
+            indices=flow.FixedTensorDef(indices_shape, dtype=flow.int32),
+        ):
+            return flow.gather(x, indices, axis=axis)
+
+        return xla_gather_job
+
+    def _test_ones_body(self, shape, indices, axis, dtype=flow.float32):
+        np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+        x = np.ones(shape, dtype=np_dtype)
+        self._test_body(x, indices, axis, dtype=dtype)
+
+    def _test_random_body(self, shape, indices, axis, dtype=flow.float32):
+        np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+        x = np.random.random(shape).astype(np_dtype)
+        self._test_body(x, indices, axis, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 1), [0], 0)
+        self._test_ones_body((2, 2), [0, 0], 0)
+        self._test_ones_body((1, 10), [[0], [0]], 0)
+        self._test_ones_body((1, 10), [[0, 1, 2], [2, 3, 4]], 1)
+        self._test_ones_body((2, 10, 2), [[0, 1], [2, 3], [4, 5]], 1)
+        self._test_ones_body((2, 5, 2, 2), [[0, 0], [1, 1]], 3)
+
+    def test_random_input(self):
+        self._test_random_body((1, 1), [0], 0)
+        self._test_random_body((2, 2), [0, 0], 0)
+        self._test_random_body((1, 10), [[0], [0]], 0)
+        self._test_random_body((1, 10), [[0, 1, 2], [2, 3, 4]], 1)
+        self._test_random_body((2, 10, 2), [[0, 1], [2, 3], [4, 5]], 1)
+        self._test_random_body((2, 5, 2, 2), [[0, 0], [1, 1]], 3)
+
+
+class TestBatchGather(TestGather):
+    def make_job(self, input_shape, indices_shape, axis, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def batch_gather_job(
+            x=flow.FixedTensorDef(input_shape, dtype=dtype),
+            indices=flow.FixedTensorDef(indices_shape, dtype=flow.int32),
+        ):
+            return flow.gather(x, indices, batch_dims=axis)
+
+        return batch_gather_job
+
+    def make_xla_job(self, input_shape, indices_shape, axis, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_batch_gather_job(
+            x=flow.FixedTensorDef(input_shape, dtype=dtype),
+            indices=flow.FixedTensorDef(indices_shape, dtype=flow.int32),
+        ):
+            return flow.gather(x, indices, batch_dims=axis)
+
+        return xla_batch_gather_job
+
+    def test_ones_input(self):
+        self._test_ones_body((2, 3, 2), [[0], [1]], 1)
+        self._test_ones_body((2, 3, 2), [[0, 1], [1, 0]], 1)
+        self._test_ones_body((2, 3, 2, 2), [[[0], [0], [0]], [[1], [1], [1]]], 2)
+
+    def test_random_input(self):
+        self._test_random_body((2, 3, 2), [[0], [1]], 1)
+        self._test_random_body((2, 3, 2), [[0, 1], [1, 2]], 1)
+        self._test_random_body((2, 3, 2, 2), [[[0], [0], [0]], [[1], [1], [1]]], 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_gelu.py b/python/oneflow/compatible/single_client/test/xrt/test_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd57d0349a7fdda9df0494fa369f2a5e676a7a91
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_gelu.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def gelu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.gelu(x)
+
+    return gelu_job
+
+
+def make_xla_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_gelu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.gelu(x)
+
+    return xla_gelu_job
+
+
+class TestGelu(unittest.TestCase):
+    def _test_body(self, x, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, dtype=dtype)
+
+    def _test_random_body(self, shape, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1)
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body(1)
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_gelu_grad.py b/python/oneflow/compatible/single_client/test/xrt/test_gelu_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..48ced8ae7914ecbaaa6f853107f26e58260b9965
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_gelu_grad.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def gelu_grad_job(
+        x=flow.FixedTensorDef(shape, dtype=dtype),
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+    ):
+        return flow.math.gelu_grad(x, dy)
+
+    return gelu_grad_job
+
+
+def make_xla_job(shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_gelu_grad_job(
+        x=flow.FixedTensorDef(shape, dtype=dtype),
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+    ):
+        return flow.math.gelu_grad(x, dy)
+
+    return xla_gelu_grad_job
+
+
+class TestGeluGrad(unittest.TestCase):
+    def _test_body(self, x, dy, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        a = f1(x, dy).get()
+        b = f2(x, dy).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        dy = np.ones(shape, dtype=dtype)
+        self._test_body(x, dy, dtype=dtype)
+
+    def _test_random_body(self, shape, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        dy = np.random.random(shape).astype(dtype)
+        self._test_body(x, dy, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1)
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body(1)
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_identity.py b/python/oneflow/compatible/single_client/test/xrt/test_identity.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09fc9abad8f0c64f3e1350ed8d497bcfc827a13
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_identity.py
@@ -0,0 +1,97 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def identity_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.identity(x)
+
+    return identity_job
+
+
+def make_xla_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_identity_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.identity(x)
+
+    return xla_identity_job
+
+
+def make_trt_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_identity_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.identity(x)
+
+    return trt_identity_job
+
+
+class TestIdentity(unittest.TestCase):
+    def _test_body(self, x, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        f3 = make_trt_job(x.shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        c = f3(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        print("with tensorrt: ", c)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, dtype=dtype)
+
+    def _test_random_body(self, shape, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1)
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body(1)
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_layer_norm.py b/python/oneflow/compatible/single_client/test/xrt/test_layer_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa96283fb7ce898325822d83b80879eca665c5b1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_layer_norm.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, norm_axis, params_axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def layer_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.layers.layer_norm(
+            x, begin_norm_axis=norm_axis, begin_params_axis=params_axis
+        )
+
+    return layer_norm_job
+
+
+def make_xla_job(input_shape, norm_axis, params_axis, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_layer_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.layers.layer_norm(
+            x, begin_norm_axis=norm_axis, begin_params_axis=params_axis
+        )
+
+    return xla_layer_norm_job
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+class TestLayerNorm(unittest.TestCase):
+    def _test_body(self, x, norm_axis, params_axis, dtype=np.float32):
+        f1 = make_job(x.shape, norm_axis, params_axis, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, norm_axis, params_axis, dtype=flow.float32)
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a.numpy())
+        print("with xla", b.numpy())
+        self.assertTrue(
+            np.allclose(a.numpy(), b.numpy(), rtol=0.05, atol=0.05),
+            a.numpy() - b.numpy(),
+        )
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, norm_axis=-1, params_axis=-1, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, norm_axis, params_axis, dtype=dtype)
+
+    def _test_random_body(self, shape, norm_axis=-1, params_axis=-1, dtype=np.float32):
+        x = (10 * np.random.random(shape)).astype(dtype)
+        self._test_body(x, norm_axis, params_axis, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_layer_norm_grad.py b/python/oneflow/compatible/single_client/test/xrt/test_layer_norm_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f2e023e4aad1ea83157a7f52d47a72438d37013
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_layer_norm_grad.py
@@ -0,0 +1,108 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(shape, mean_shape, norm_axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def layer_norm_grad_job(
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+        x=flow.FixedTensorDef(shape, dtype=dtype),
+        mean=flow.FixedTensorDef(mean_shape, dtype=dtype),
+        inv_variance=flow.FixedTensorDef(mean_shape, dtype=dtype),
+    ):
+        return flow.layers.layer_norm_grad(
+            dy, x, mean, inv_variance, begin_norm_axis=norm_axis
+        )
+
+    return layer_norm_grad_job
+
+
+def make_xla_job(shape, mean_shape, norm_axis, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_layer_norm_grad_job(
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+        x=flow.FixedTensorDef(shape, dtype=dtype),
+        mean=flow.FixedTensorDef(mean_shape, dtype=dtype),
+        inv_variance=flow.FixedTensorDef(mean_shape, dtype=dtype),
+    ):
+        return flow.layers.layer_norm_grad(
+            dy, x, mean, inv_variance, begin_norm_axis=norm_axis
+        )
+
+    return xla_layer_norm_grad_job
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+class TestLayerNormGrad(unittest.TestCase):
+    def _test_body(self, dy, x, mean, inv_variance, norm_axis, dtype=np.float32):
+        f1 = make_job(x.shape, mean.shape, norm_axis, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, mean.shape, norm_axis, dtype=flow.float32)
+        a = f1(dy, x, mean, inv_variance).get()
+        b = f2(dy, x, mean, inv_variance).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, norm_axis=-1, dtype=np.float32):
+        dy = np.ones(shape, dtype=dtype)
+        x = np.ones(shape, dtype=dtype)
+        if norm_axis < 0:
+            norm_axis += len(shape)
+        mean_shape = shape[:norm_axis]
+        mean = np.ones(mean_shape, dtype=dtype)
+        inv_variance = np.ones(mean_shape, dtype=dtype)
+        self._test_body(dy, x, mean, inv_variance, norm_axis, dtype=dtype)
+
+    def _test_random_body(self, shape, norm_axis=-1, dtype=np.float32):
+        dy = np.random.random(shape).astype(dtype)
+        x = np.random.random(shape).astype(dtype)
+        if norm_axis < 0:
+            norm_axis += len(shape)
+        mean_shape = shape[:norm_axis]
+        mean = np.random.random(mean_shape).astype(dtype)
+        inv_variance = np.random.random(mean_shape).astype(dtype)
+        self._test_body(dy, x, mean, inv_variance, norm_axis, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_layer_norm_param_grad.py b/python/oneflow/compatible/single_client/test/xrt/test_layer_norm_param_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..03090422f85f2cec9f17e1b7a145564ef21f7041
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_layer_norm_param_grad.py
@@ -0,0 +1,126 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(shape, gamma_shape, params_axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def layer_norm_param_grad_job(
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+        norm=flow.FixedTensorDef(shape, dtype=dtype),
+        gamma=flow.FixedTensorDef(gamma_shape, dtype=dtype),
+    ):
+        return flow.layers.layer_norm_param_grad(
+            dy, norm, gamma, begin_params_axis=params_axis
+        )
+
+    return layer_norm_param_grad_job
+
+
+def make_xla_job(shape, gamma_shape, params_axis, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_layer_norm_param_grad_job(
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+        norm=flow.FixedTensorDef(shape, dtype=dtype),
+        gamma=flow.FixedTensorDef(gamma_shape, dtype=dtype),
+    ):
+        return flow.layers.layer_norm_param_grad(
+            dy, norm, gamma, begin_params_axis=params_axis
+        )
+
+    return xla_layer_norm_param_grad_job
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+class TestLayerNormParamGrad(unittest.TestCase):
+    def _test_body(self, dy, norm, gamma, params_axis, dtype=np.float32):
+        f1 = make_job(dy.shape, gamma.shape, params_axis, dtype=flow.float32)
+        f2 = make_xla_job(dy.shape, gamma.shape, params_axis, dtype=flow.float32)
+        (d_norm1, d_beta1, d_gamma1) = f1(dy, norm, gamma).get()
+        (d_norm2, d_beta2, d_gamma2) = f2(dy, norm, gamma).get()
+        print("normalize diff:")
+        print("    without xla: ", d_norm1)
+        print("    with xla: ", d_norm2)
+        print("beta diff:")
+        print("    without xla: ", d_beta1)
+        print("    with xla: ", d_beta2)
+        print("gamma diff:")
+        print("    without xla: ", d_gamma1)
+        print("    with xla: ", d_gamma2)
+        self.assertTrue(d_norm1.shape, d_norm2.shape)
+        self.assertTrue(d_beta1.shape, d_beta2.shape)
+        self.assertTrue(d_gamma1.shape, d_gamma2.shape)
+        self.assertTrue(
+            np.allclose(d_norm1.numpy(), d_norm2.numpy(), rtol=0.001, atol=1e-05)
+        )
+        self.assertTrue(
+            np.allclose(d_beta1.numpy(), d_beta2.numpy(), rtol=0.001, atol=1e-05)
+        )
+        self.assertTrue(
+            np.allclose(d_gamma1.numpy(), d_gamma2.numpy(), rtol=0.001, atol=1e-05)
+        )
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, params_axis=-1, dtype=np.float32):
+        dy = np.ones(shape, dtype=dtype)
+        norm = np.ones(shape, dtype=dtype)
+        if params_axis < 0:
+            params_axis += len(shape)
+        gamma_shape = shape[params_axis:]
+        if len(gamma_shape) == 0:
+            gamma_shape = [1]
+        gamma = np.ones(gamma_shape, dtype=dtype)
+        self._test_body(dy, norm, gamma, params_axis, dtype=dtype)
+
+    def _test_random_body(self, shape, params_axis=-1, dtype=np.float32):
+        dy = np.random.random(shape).astype(dtype)
+        norm = np.random.random(shape).astype(dtype)
+        if params_axis < 0:
+            params_axis += len(shape)
+        gamma_shape = shape[params_axis:]
+        if len(gamma_shape) == 0:
+            gamma_shape = [1]
+        gamma = np.random.random(gamma_shape).astype(dtype)
+        self._test_body(dy, norm, gamma, params_axis, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_leaky_relu.py b/python/oneflow/compatible/single_client/test/xrt/test_leaky_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2afe2a29ad992100518993c1237a520eca1f6a3
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_leaky_relu.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, alpha, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def leaky_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.nn.leaky_relu(x, alpha=alpha)
+
+    return leaky_relu_job
+
+
+def make_trt_job(input_shape, alpha, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_leaky_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.nn.leaky_relu(x, alpha=alpha)
+
+    return trt_leaky_relu_job
+
+
+class TestLeakyRelu(unittest.TestCase):
+    def _test_body(self, x, alpha, dtype=np.float32):
+        f1 = make_job(x.shape, alpha, dtype=flow.float32)
+        f2 = make_trt_job(x.shape, alpha, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("oneflow: ", a)
+        print("oneflow with tensorrt: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, alpha=0.1, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, alpha, dtype=dtype)
+
+    def _test_random_body(self, shape, alpha=0.1, dtype=np.float32):
+        x = 100 * (np.random.random(shape).astype(dtype) - 0.5)
+        self._test_body(x, alpha, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1, alpha=0.1)
+        self._test_ones_body((1, 10), alpha=0.1)
+        self._test_ones_body((2, 10, 2), alpha=0.1)
+        self._test_ones_body((2, 5, 2, 2), alpha=0.1)
+        self._test_ones_body(1, alpha=0.33)
+        self._test_ones_body((1, 10), alpha=0.33)
+        self._test_ones_body((2, 10, 2), alpha=0.33)
+        self._test_ones_body((2, 5, 2, 2), alpha=0.33)
+
+    def test_random_input(self):
+        self._test_random_body(1, alpha=0.1)
+        self._test_random_body((1, 10), alpha=0.1)
+        self._test_random_body((2, 10, 2), alpha=0.1)
+        self._test_random_body((2, 5, 2, 2), alpha=0.1)
+        self._test_random_body(1, alpha=0.33)
+        self._test_random_body((1, 10), alpha=0.33)
+        self._test_random_body((2, 10, 2), alpha=0.33)
+        self._test_random_body((2, 5, 2, 2), alpha=0.33)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_matmul.py b/python/oneflow/compatible/single_client/test/xrt/test_matmul.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c8e50544f9a17795b59fc3227508f3aa3b1ed84
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_matmul.py
@@ -0,0 +1,148 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(a_shape, b_shape, trans_a=False, trans_b=False, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def matmul_job(
+        a=flow.FixedTensorDef(a_shape, dtype=dtype),
+        b=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
+
+    return matmul_job
+
+
+def make_xla_job(a_shape, b_shape, trans_a=False, trans_b=False, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_matmul_job(
+        a=flow.FixedTensorDef(a_shape, dtype=dtype),
+        b=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
+
+    return xla_matmul_job
+
+
+def make_trt_job(a_shape, b_shape, trans_a=False, trans_b=False, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_matmul_job(
+        a=flow.FixedTensorDef(a_shape, dtype=dtype),
+        b=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        return flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
+
+    return trt_matmul_job
+
+
+class TestMatmul(unittest.TestCase):
+    def make_shape(self, m, n, transpose):
+        if transpose:
+            return (n, m)
+        else:
+            return (m, n)
+
+    def _test_body(self, a, b, trans_a, trans_b, dtype=np.float32):
+        f1 = make_job(a.shape, b.shape, trans_a, trans_b)
+        f2 = make_xla_job(a.shape, b.shape, trans_a, trans_b)
+        f3 = make_trt_job(a.shape, b.shape, trans_a, trans_b)
+        x = f1(a, b).get()
+        y = f2(a, b).get()
+        z = f3(a, b).get()
+        print("without xla: ", x)
+        print("with xla: ", y)
+        print("with tensorrt: ", y)
+        self.assertTrue(np.allclose(x.numpy(), y.numpy(), rtol=0.001, atol=1e-05))
+        self.assertTrue(np.allclose(x.numpy(), z.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, m, k, n, trans_a, trans_b, dtype=np.float32):
+        shape_a = self.make_shape(m, k, trans_a)
+        shape_b = self.make_shape(k, n, trans_b)
+        a = np.ones(shape_a, dtype=dtype)
+        b = np.ones(shape_b, dtype=dtype)
+        self._test_body(a, b, trans_a, trans_b, dtype=dtype)
+
+    def _test_random_body(self, m, k, n, trans_a, trans_b, dtype=np.float32):
+        shape_a = self.make_shape(m, k, trans_a)
+        shape_b = self.make_shape(k, n, trans_b)
+        a = np.random.random(shape_a).astype(dtype)
+        b = np.random.random(shape_b).astype(dtype)
+        self._test_body(a, b, trans_a, trans_b, dtype=dtype)
+
+    def test_ones1x1x1_input(self):
+        print("run test_ones1x1x1_input: ")
+        self._test_ones_body(1, 1, 1, False, False)
+        self._test_ones_body(1, 1, 1, False, True)
+        self._test_ones_body(1, 1, 1, True, False)
+        self._test_ones_body(1, 1, 1, True, True)
+
+    def test_random1x1x1_input(self):
+        print("test_random1x1x1_input: ")
+        self._test_random_body(1, 1, 1, False, False)
+        self._test_random_body(1, 1, 1, False, True)
+        self._test_random_body(1, 1, 1, True, False)
+        self._test_random_body(1, 1, 1, True, True)
+
+    def test_ones1x10x1_input(self):
+        print("test_ones1x10x1_input: ")
+        self._test_ones_body(1, 10, 1, False, False)
+        self._test_ones_body(1, 10, 1, False, True)
+        self._test_ones_body(1, 10, 1, True, False)
+        self._test_ones_body(1, 10, 1, True, True)
+
+    def test_random1x10x1_input(self):
+        print("test_random1x10x1_input: ")
+        self._test_random_body(1, 10, 1, False, False)
+        self._test_random_body(1, 10, 1, False, True)
+        self._test_random_body(1, 10, 1, True, False)
+        self._test_random_body(1, 10, 1, True, True)
+
+    def test_ones10x10x2_input(self):
+        print("test_ones10x10x2_input: ")
+        self._test_ones_body(10, 10, 2, False, False)
+        self._test_ones_body(10, 10, 2, False, True)
+        self._test_ones_body(10, 10, 2, True, False)
+        self._test_ones_body(10, 10, 2, True, True)
+
+    def test_random10x10x2_input(self):
+        print("run test_random10x10x2_input: ")
+        self._test_random_body(10, 10, 2, False, False)
+        self._test_random_body(10, 10, 2, False, True)
+        self._test_random_body(10, 10, 2, True, False)
+        self._test_random_body(10, 10, 2, True, True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_matmul_add.py b/python/oneflow/compatible/single_client/test/xrt/test_matmul_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..049850f66d83fb2fae600ef1a6e784edb93ce6a5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_matmul_add.py
@@ -0,0 +1,219 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(
+    a_shape,
+    b_shape,
+    trans_a=False,
+    trans_b=False,
+    fuse_add_to_output=True,
+    dtype=flow.float32,
+):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+    config.enable_fuse_add_to_output(fuse_add_to_output)
+
+    @flow.global_function(config)
+    def matmul_job(
+        a=flow.FixedTensorDef(a_shape, dtype=dtype),
+        b=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        out = flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
+        c = flow.get_variable(
+            "c",
+            shape=out.shape,
+            dtype=flow.float,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        out = flow.math.add_n([out, c])
+        return out
+
+    return matmul_job
+
+
+def make_xla_job(
+    a_shape,
+    b_shape,
+    trans_a=False,
+    trans_b=False,
+    fuse_add_to_output=True,
+    dtype=flow.float32,
+):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+    config.enable_fuse_add_to_output(fuse_add_to_output)
+
+    @flow.global_function(config)
+    def xla_matmul_job(
+        a=flow.FixedTensorDef(a_shape, dtype=dtype),
+        b=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        out = flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
+        c = flow.get_variable(
+            "c",
+            shape=out.shape,
+            dtype=flow.float,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        out = flow.math.add_n([out, c])
+        return out
+
+    return xla_matmul_job
+
+
+def make_trt_job(
+    a_shape,
+    b_shape,
+    trans_a=False,
+    trans_b=False,
+    fuse_add_to_output=True,
+    dtype=flow.float32,
+):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+    config.enable_fuse_add_to_output(fuse_add_to_output)
+
+    @flow.global_function(config)
+    def trt_matmul_job(
+        a=flow.FixedTensorDef(a_shape, dtype=dtype),
+        b=flow.FixedTensorDef(b_shape, dtype=dtype),
+    ):
+        out = flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
+        c = flow.get_variable(
+            "c",
+            shape=out.shape,
+            dtype=flow.float,
+            initializer=flow.ones_initializer(),
+            trainable=True,
+        )
+        out = flow.math.add_n([out, c])
+        return out
+
+    return trt_matmul_job
+
+
+class TestMatmul(unittest.TestCase):
+    def make_shape(self, m, n, transpose):
+        if transpose:
+            return (n, m)
+        else:
+            return (m, n)
+
+    def _test_body(self, a, b, trans_a, trans_b, fuse_add_to_output, dtype=np.float32):
+        f1 = make_job(a.shape, b.shape, trans_a, trans_b, fuse_add_to_output)
+        f2 = make_xla_job(a.shape, b.shape, trans_a, trans_b, fuse_add_to_output)
+        f3 = make_trt_job(a.shape, b.shape, trans_a, trans_b, fuse_add_to_output)
+        x = f1(a, b).get()
+        y = f2(a, b).get()
+        z = f3(a, b).get()
+        print("without xla: ", x.numpy())
+        print("with xla: ", y.numpy())
+        print("with tensorrt: ", z.numpy())
+        self.assertTrue(np.allclose(x.numpy(), y.numpy(), rtol=0.001, atol=1e-05))
+        self.assertTrue(np.allclose(x.numpy(), z.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(
+        self, m, k, n, trans_a, trans_b, fuse_add_to_output, dtype=np.float32
+    ):
+        shape_a = self.make_shape(m, k, trans_a)
+        shape_b = self.make_shape(k, n, trans_b)
+        a = np.ones(shape_a, dtype=dtype)
+        b = np.ones(shape_b, dtype=dtype)
+        self._test_body(a, b, trans_a, trans_b, fuse_add_to_output, dtype=dtype)
+
+    def _test_random_body(
+        self, m, k, n, trans_a, trans_b, fuse_add_to_output, dtype=np.float32
+    ):
+        shape_a = self.make_shape(m, k, trans_a)
+        shape_b = self.make_shape(k, n, trans_b)
+        a = np.random.random(shape_a).astype(dtype)
+        b = np.random.random(shape_b).astype(dtype)
+        self._test_body(a, b, trans_a, trans_b, fuse_add_to_output, dtype=dtype)
+
+    def test_ones1x1x1_input(self):
+        print("run test_ones1x1x1_input: ")
+        self._test_ones_body(1, 1, 1, False, False, True)
+        self._test_ones_body(1, 1, 1, False, False, False)
+        self._test_ones_body(1, 1, 1, False, True, True)
+        self._test_ones_body(1, 1, 1, False, True, False)
+        self._test_ones_body(1, 1, 1, True, False, True)
+        self._test_ones_body(1, 1, 1, True, False, False)
+        self._test_ones_body(1, 1, 1, True, True, True)
+        self._test_ones_body(1, 1, 1, True, True, False)
+
+    def test_random1x1x1_input(self):
+        print("test_random1x1x1_input: ")
+        self._test_random_body(1, 1, 1, False, False, True)
+        self._test_random_body(1, 1, 1, False, False, False)
+        self._test_random_body(1, 1, 1, False, True, True)
+        self._test_random_body(1, 1, 1, False, True, False)
+        self._test_random_body(1, 1, 1, True, False, True)
+        self._test_random_body(1, 1, 1, True, False, False)
+        self._test_random_body(1, 1, 1, True, True, True)
+        self._test_random_body(1, 1, 1, True, True, False)
+
+
+def test_ones1x10x1_input(self):
+    print("test_ones1x10x1_input: ")
+    self._test_ones_body(1, 10, 1, False, False, True)
+    self._test_ones_body(1, 10, 1, False, False, False)
+    self._test_ones_body(1, 10, 1, False, True, True)
+    self._test_ones_body(1, 10, 1, False, True, False)
+    self._test_ones_body(1, 10, 1, True, False, True)
+    self._test_ones_body(1, 10, 1, True, False, False)
+    self._test_ones_body(1, 10, 1, True, True, True)
+    self._test_ones_body(1, 10, 1, True, True, False)
+
+
+def test_random1x10x1_input(self):
+    print("test_random1x10x1_input: ")
+    self._test_random_body(1, 10, 1, False, False, True)
+    self._test_random_body(1, 10, 1, False, True, True)
+    self._test_random_body(1, 10, 1, True, False, True)
+    self._test_random_body(1, 10, 1, True, True, True)
+
+
+def test_ones10x10x2_input(self):
+    print("test_ones10x10x2_input: ")
+    self._test_ones_body(10, 10, 2, False, False, True)
+    self._test_ones_body(10, 10, 2, False, True, True)
+    self._test_ones_body(10, 10, 2, True, False, True)
+    self._test_ones_body(10, 10, 2, True, True, True)
+
+
+def test_random10x10x2_input(self):
+    print("run test_random10x10x2_input: ")
+    self._test_random_body(10, 10, 2, False, False, True)
+    self._test_random_body(10, 10, 2, False, True, True)
+    self._test_random_body(10, 10, 2, True, False, True)
+    self._test_random_body(10, 10, 2, True, True, True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_multiply.py b/python/oneflow/compatible/single_client/test/xrt/test_multiply.py
new file mode 100644
index 0000000000000000000000000000000000000000..32157a17dd270421d0f850b031dd01c4a3530e88
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_multiply.py
@@ -0,0 +1,88 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(x_shape, y_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def multiply_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        y=flow.FixedTensorDef(y_shape, dtype=dtype),
+    ):
+        return flow.math.multiply(x, y)
+
+    return multiply_job
+
+
+def make_trt_job(x_shape, y_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_multiply_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        y=flow.FixedTensorDef(y_shape, dtype=dtype),
+    ):
+        return flow.math.multiply(x, y)
+
+    return trt_multiply_job
+
+
+class TestMultiply(unittest.TestCase):
+    def _test_body(self, x, y, dtype=np.float32):
+        f1 = make_job(x.shape, y.shape, dtype=flow.float32)
+        f2 = make_trt_job(x.shape, y.shape, dtype=flow.float32)
+        a = f1(x, y).get()
+        b = f2(x, y).get()
+        print("without tensorrt: ", a)
+        print("with tensorrt", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, y_shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        y = np.ones(y_shape, dtype=dtype)
+        self._test_body(x, y, dtype=dtype)
+
+    def _test_random_body(self, x_shape, y_shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        y = np.random.random(y_shape).astype(dtype)
+        self._test_body(x, y, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10), (1, 10))
+        self._test_ones_body((2, 10, 2), (2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2), (2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10), (1, 10))
+        self._test_random_body((2, 10, 2), (2, 10, 2))
+        self._test_random_body((2, 5, 2, 2), (2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_online_int8.py b/python/oneflow/compatible/single_client/test/xrt/test_online_int8.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc28f1d6c78e6ef5ff6a22d9eba707fcd269124d
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_online_int8.py
@@ -0,0 +1,153 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_trt_job(
+    x_shape,
+    w_shape,
+    kernel_size=None,
+    strides=None,
+    padding="valid",
+    data_format="NCHW",
+    dilation_rate=None,
+    dtype=flow.float32,
+):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+    config.tensorrt.use_int8()
+
+    @flow.global_function(config)
+    def trt_conv2d_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        weight=flow.FixedTensorDef(w_shape, dtype=dtype),
+    ):
+        return flow.nn.conv2d(
+            x, weight, strides, padding, None, data_format, dilation_rate
+        )
+
+    return trt_conv2d_job
+
+
+class TestConv2d(unittest.TestCase):
+    def make_filter_shape(self, shape, filters, kernel_size, data_format):
+        if data_format == "NCHW":
+            return [filters, shape[1], kernel_size, kernel_size]
+        else:
+            return [filters, kernel_size, kernel_size, shape[3]]
+
+    def _test_body(
+        self,
+        x,
+        filters,
+        kernel_size,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        dtype=np.float32,
+    ):
+        f2 = make_trt_job(
+            x.shape,
+            filters.shape,
+            kernel_size,
+            strides,
+            padding,
+            data_format,
+            dilation_rate,
+            dtype=flow.float32,
+        )
+        for i in range(1):
+            b = f2(x, filters).get()
+            print("with tensorrt float32: ", b)
+        flow.tensorrt.cache_int8_calibration()
+        for i in range(1):
+            b = f2(x, filters).get()
+            print("with tensorrt int8: ", b)
+        flow.clear_default_session()
+
+    def _test_ones_body(
+        self,
+        shape,
+        filters,
+        kernel_size,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        dtype=np.float32,
+    ):
+        assert len(shape) == 4
+        x = np.ones(shape, dtype=dtype)
+        w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
+        weight = np.random.random(w_shape).astype(dtype)
+        self._test_body(
+            x,
+            weight,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            dilation_rate=dilation_rate,
+        )
+
+    def _test_random_body(
+        self,
+        shape,
+        filters,
+        kernel_size,
+        strides,
+        padding,
+        data_format,
+        dilation_rate,
+        dtype=np.float32,
+    ):
+        assert len(shape) == 4
+        x = np.random.random(shape).astype(dtype)
+        w_shape = self.make_filter_shape(shape, filters, kernel_size, data_format)
+        weight = np.random.random(w_shape).astype(dtype)
+        self._test_body(
+            x,
+            weight,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            dilation_rate=dilation_rate,
+        )
+
+    def test_random_kernel_1x1(self):
+        self._test_random_body(
+            shape=[3, 3, 5, 5],
+            filters=1,
+            kernel_size=1,
+            strides=1,
+            padding="VALID",
+            data_format="NCHW",
+            dilation_rate=1,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_pooling.py b/python/oneflow/compatible/single_client/test/xrt/test_pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c9cff4023d41d7e5c89fd5a298995dd0c7a9a42
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_pooling.py
@@ -0,0 +1,181 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+class TestPooling(unittest.TestCase):
+    run_test = False
+
+    def _test_body(self, x, ksize, strides, padding, data_format, dtype=np.float32):
+        if not self.run_test:
+            return
+        f1 = self.make_job(
+            x.shape, ksize, strides, padding, data_format, dtype=flow.float32
+        )
+        f2 = self.make_trt_job(
+            x.shape, ksize, strides, padding, data_format, dtype=flow.float32
+        )
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without trt: ", a)
+        print("with tensorrt", b)
+        self.assertTrue(a.shape == b.shape)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(
+        self, shape, ksize, strides, padding, data_format, dtype=np.float32
+    ):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(
+            x,
+            ksize=ksize,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            dtype=dtype,
+        )
+
+    def _test_random_body(
+        self, shape, ksize, strides, padding, data_format, dtype=np.float32
+    ):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(
+            x,
+            ksize=ksize,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            dtype=dtype,
+        )
+
+    def test_ones_input(self):
+        print("test ones input: ")
+        self._test_ones_body((1, 1, 6, 6), 1, 1, "VALID", "NCHW")
+        self._test_ones_body((1, 3, 6, 6), 3, 2, "SAME", "NCHW")
+        self._test_ones_body((1, 1, 3, 3), 1, 1, "VALID", "NCHW")
+        self._test_ones_body((1, 5, 9, 9), 3, 1, "SAME", "NCHW")
+        self._test_ones_body((1, 7, 9, 9), 1, 1, "SAME", "NCHW")
+        self._test_ones_body((1, 5, 3, 3), 1, 1, "VALID", "NCHW")
+        self._test_ones_body((1, 1, 6, 6), 2, 2, "SAME", "NCHW")
+        self._test_ones_body((1, 1, 6, 6), 2, 2, "VALID", "NCHW")
+        self._test_ones_body((1, 1, 9, 9), 2, 2, "SAME", "NCHW")
+        self._test_ones_body((1, 1, 9, 9), 2, 2, "VALID", "NCHW")
+
+    def test_random_input(self):
+        print("test random input: ")
+        self._test_random_body((1, 1, 6, 6), 1, 1, "VALID", "NCHW")
+        self._test_random_body((1, 3, 6, 6), 3, 2, "SAME", "NCHW")
+        self._test_random_body((1, 5, 6, 6), 3, 2, "VALID", "NCHW")
+        self._test_random_body((1, 7, 6, 6), 3, 2, "SAME", "NCHW")
+        self._test_random_body((1, 3, 3, 3), 1, 1, "VALID", "NCHW")
+        self._test_random_body((1, 3, 6, 6), 3, 2, "SAME", "NCHW")
+        self._test_random_body((1, 1, 6, 6), 2, 2, "SAME", "NCHW")
+        self._test_random_body((1, 1, 6, 6), 2, 2, "VALID", "NCHW")
+        self._test_random_body((1, 1, 9, 9), 2, 2, "SAME", "NCHW")
+        self._test_random_body((1, 1, 9, 9), 2, 2, "VALID", "NCHW")
+
+
+class TestMaxPooling(TestPooling):
+    run_test = True
+
+    def make_job(
+        self, x_shape, ksize, strides, padding, data_format, dtype=flow.float32
+    ):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def max_pooling_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.nn.max_pool2d(
+                x,
+                ksize=ksize,
+                strides=strides,
+                padding=padding,
+                data_format=data_format,
+            )
+
+        return max_pooling_job
+
+    def make_trt_job(
+        self, x_shape, ksize, strides, padding, data_format, dtype=flow.float32
+    ):
+        config.use_xla_jit(False)
+        config.use_tensorrt(True)
+
+        @flow.global_function(config)
+        def trt_max_pooling_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.nn.max_pool2d(
+                x,
+                ksize=ksize,
+                strides=strides,
+                padding=padding,
+                data_format=data_format,
+            )
+
+        return trt_max_pooling_job
+
+
+class TestAveragePooling(TestPooling):
+    run_test = True
+
+    def make_job(
+        self, x_shape, ksize, strides, padding, data_format, dtype=flow.float32
+    ):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def avg_pooling_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.nn.avg_pool2d(
+                x,
+                ksize=ksize,
+                strides=strides,
+                padding=padding,
+                data_format=data_format,
+            )
+
+        return avg_pooling_job
+
+    def make_trt_job(
+        self, x_shape, ksize, strides, padding, data_format, dtype=flow.float32
+    ):
+        config.use_xla_jit(False)
+        config.use_tensorrt(True)
+
+        @flow.global_function(config)
+        def trt_avg_pooling_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.nn.avg_pool2d(
+                x,
+                ksize=ksize,
+                strides=strides,
+                padding=padding,
+                data_format=data_format,
+            )
+
+        return trt_avg_pooling_job
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_reduce_op.py b/python/oneflow/compatible/single_client/test/xrt/test_reduce_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b01a58498721fe148c3789b14d3b8d1a37f78147
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_reduce_op.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+class TestReduce(unittest.TestCase):
+    run_test = False
+
+    def _test_body(self, x, axis, keepdims, dtype=np.float32):
+        if not self.run_test:
+            return
+        f1 = self.make_job(x.shape, axis, keepdims, dtype=flow.float32)
+        f2 = self.make_xla_job(x.shape, axis, keepdims, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(a.shape == b.shape)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = self.make_trt_job(x.shape, axis, keepdims, dtype=flow.float32)
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(a.shape == c.shape)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, axis, keepdims, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, axis, keepdims, dtype=dtype)
+
+    def _test_random_body(self, shape, axis, keepdims, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, axis, keepdims, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1, [0], True)
+        self._test_ones_body((1, 10), [1], False)
+        self._test_ones_body((1, 10), [1], True)
+        self._test_ones_body((1, 10), [0, 1], True)
+        self._test_ones_body((2, 10, 2), [1, 2], False)
+        self._test_ones_body((2, 10, 2), [1, 2], True)
+
+    def test_random_input(self):
+        self._test_random_body(1, [0], True)
+        self._test_random_body((1, 10), [1], False)
+        self._test_random_body((1, 10), [1], True)
+        self._test_random_body((1, 10), [0, 1], True)
+        self._test_random_body((2, 10, 2), [1, 2], False)
+        self._test_random_body((2, 10, 2), [1, 2], True)
+
+
+class TestReduceSum(TestReduce):
+    run_test = True
+
+    def make_job(self, x_shape, axis, keepdims, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def reduce_sum_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+
+        return reduce_sum_job
+
+    def make_xla_job(self, x_shape, axis, keepdims, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_reduce_sum_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+
+        return xla_reduce_sum_job
+
+    def make_trt_job(self, x_shape, axis, keepdims, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(True)
+
+        @flow.global_function(config)
+        def trt_reduce_sum_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
+
+        return trt_reduce_sum_job
+
+
+class TestReduceMean(TestReduce):
+    run_test = True
+
+    def make_job(self, x_shape, axis, keepdims, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def reduce_mean_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.reduce_mean(x, axis=axis, keepdims=keepdims)
+
+        return reduce_mean_job
+
+    def make_xla_job(self, x_shape, axis, keepdims, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_reduce_mean_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.reduce_mean(x, axis=axis, keepdims=keepdims)
+
+        return xla_reduce_mean_job
+
+    def make_trt_job(self, x_shape, axis, keepdims, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(True)
+
+        @flow.global_function(config)
+        def trt_reduce_mean_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.reduce_mean(x, axis=axis, keepdims=keepdims)
+
+        return trt_reduce_mean_job
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_relu.py b/python/oneflow/compatible/single_client/test/xrt/test_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c11777bb921d5a1fd084ec8ad0b7a5edd01610d5
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_relu.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.relu(x)
+
+    return relu_job
+
+
+def make_xla_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.relu(x)
+
+    return xla_relu_job
+
+
+def make_trt_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_relu_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.relu(x)
+
+    return trt_relu_job
+
+
+class TestRelu(unittest.TestCase):
+    def _test_body(self, x, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, dtype=flow.float32)
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, dtype=dtype)
+
+    def _test_random_body(self, shape, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1)
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body(1)
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_reshape.py b/python/oneflow/compatible/single_client/test/xrt/test_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..58ccdb2462433e056775a38e2c4fdbf0799b4781
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_reshape.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(x_shape, shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def reshape_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+        return flow.reshape(x, shape)
+
+    return reshape_job
+
+
+def make_xla_job(x_shape, shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_reshape_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+        return flow.reshape(x, shape)
+
+    return xla_reshape_job
+
+
+def make_trt_job(x_shape, shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_reshape_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+        return flow.reshape(x, shape)
+
+    return trt_reshape_job
+
+
+class TestReshape(unittest.TestCase):
+    def _test_body(self, x, shape, dtype=np.float32):
+        f1 = make_job(x.shape, shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(a.shape == b.shape)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, shape, dtype=flow.float32)
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(a.shape == c.shape)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        self._test_body(x, shape, dtype=dtype)
+
+    def _test_random_body(self, x_shape, shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        self._test_body(x, shape, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10), (10,))
+        self._test_ones_body((2, 10, 2), (4, 10))
+        self._test_ones_body((2, 5, 2, 2), (2, 5, 4))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10), (10,))
+        self._test_random_body((2, 10, 2), (4, 10))
+        self._test_random_body((2, 5, 2, 2), (2, 5, 4))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_reshape_like.py b/python/oneflow/compatible/single_client/test/xrt/test_reshape_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..79a09315ec631c819eccd18c9015490e68b8ea14
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_reshape_like.py
@@ -0,0 +1,109 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(x_shape, like_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def reshape_like_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        like=flow.FixedTensorDef(like_shape, dtype=dtype),
+    ):
+        return flow.reshape_like(x, like)
+
+    return reshape_like_job
+
+
+def make_xla_job(x_shape, like_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_reshape_like_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        like=flow.FixedTensorDef(like_shape, dtype=dtype),
+    ):
+        return flow.reshape_like(x, like)
+
+    return xla_reshape_like_job
+
+
+def make_trt_job(x_shape, like_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_reshape_like_job(
+        x=flow.FixedTensorDef(x_shape, dtype=dtype),
+        like=flow.FixedTensorDef(like_shape, dtype=dtype),
+    ):
+        return flow.reshape_like(x, like)
+
+    return trt_reshape_like_job
+
+
+class TestReshapeLike(unittest.TestCase):
+    def _test_body(self, x, like, dtype=np.float32):
+        f1 = make_job(x.shape, like.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, like.shape, dtype=flow.float32)
+        a = f1(x, like).get()
+        b = f2(x, like).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(a.shape == b.shape)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, like.shape, dtype=flow.float32)
+        c = f3(x, like).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(a.shape == c.shape)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, like_shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        like = np.ones(like_shape, dtype=dtype)
+        self._test_body(x, like, dtype=dtype)
+
+    def _test_random_body(self, x_shape, like_shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        like = np.random.random(like_shape).astype(dtype)
+        self._test_body(x, like, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10), (10,))
+        self._test_ones_body((2, 10, 2), (4, 10))
+        self._test_ones_body((2, 5, 2, 2), (2, 5, 4))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10), (10,))
+        self._test_random_body((2, 10, 2), (4, 10))
+        self._test_random_body((2, 5, 2, 2), (2, 5, 4))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_rsqrt.py b/python/oneflow/compatible/single_client/test/xrt/test_rsqrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..81a86cde3cb61d1be8dea5d9b0c5f3d27ed9f7a4
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_rsqrt.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(x_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def rsqrt_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+        return flow.math.rsqrt(x)
+
+    return rsqrt_job
+
+
+def make_xla_job(x_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_rsqrt_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+        return flow.math.rsqrt(x)
+
+    return xla_rsqrt_job
+
+
+class TestAdd(unittest.TestCase):
+    def _test_body(self, x, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        self._test_body(x, dtype=dtype)
+
+    def _test_random_body(self, x_shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        self._test_body(x, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_scalar_op.py b/python/oneflow/compatible/single_client/test/xrt/test_scalar_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..edaea9475365311fcccd10a8b914f5db0a1bc8f9
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_scalar_op.py
@@ -0,0 +1,110 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+class TestScalarOp(unittest.TestCase):
+    run_test = False
+
+    def _test_body(self, x, scalar, dtype=np.float32):
+        if not self.run_test:
+            return
+        f1 = self.make_job(x.shape, scalar, dtype=flow.float32)
+        f2 = self.make_xla_job(x.shape, scalar, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, scalar, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        self._test_body(x, scalar, dtype=dtype)
+
+    def _test_random_body(self, x_shape, scalar, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        self._test_body(x, scalar, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10), 2.0)
+        self._test_ones_body((2, 10, 2), 2.0)
+        self._test_ones_body((2, 5, 2, 2), 2.0)
+
+    def test_random_input(self):
+        self._test_random_body((1, 10), 2.0)
+        self._test_random_body((2, 10, 2), 2.0)
+        self._test_random_body((2, 5, 2, 2), 2.0)
+
+
+class TestScalarAddOp(TestScalarOp):
+    run_test = True
+
+    def make_job(self, x_shape, scalar, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def scalar_add_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.add(x, scalar)
+
+        return scalar_add_job
+
+    def make_xla_job(self, x_shape, scalar, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_scalar_add_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.add(x, scalar)
+
+        return xla_scalar_add_job
+
+
+class TestScalarMulOp(TestScalarOp):
+    run_test = True
+
+    def make_job(self, x_shape, scalar, dtype=flow.float32):
+        config.use_xla_jit(False)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def scalar_mul_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.multiply(x, scalar)
+
+        return scalar_mul_job
+
+    def make_xla_job(self, x_shape, scalar, dtype=flow.float32):
+        config.use_xla_jit(True)
+        config.use_tensorrt(False)
+
+        @flow.global_function(config)
+        def xla_scalar_mul_job(x=flow.FixedTensorDef(x_shape, dtype=dtype)):
+            return flow.math.multiply(x, scalar)
+
+        return xla_scalar_mul_job
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_sigmoid.py b/python/oneflow/compatible/single_client/test/xrt/test_sigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..95c83d0373008c0e41e91a0661256e64753e5f19
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_sigmoid.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def sigmoid_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.sigmoid(x)
+
+    return sigmoid_job
+
+
+def make_xla_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_sigmoid_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.sigmoid(x)
+
+    return xla_sigmoid_job
+
+
+def make_trt_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_sigmoid_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.sigmoid(x)
+
+    return trt_sigmoid_job
+
+
+class TestSigmoid(unittest.TestCase):
+    def _test_body(self, x, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, dtype=flow.float32)
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, dtype=dtype)
+
+    def _test_random_body(self, shape, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1)
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body(1)
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_softmax.py b/python/oneflow/compatible/single_client/test/xrt/test_softmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..a80460e50b32cab74db5f6e3ece07cde67fd3337
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_softmax.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def softmax_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.nn.softmax(x, axis=axis)
+
+    return softmax_job
+
+
+def make_xla_job(input_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_softmax_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.nn.softmax(x, axis=axis)
+
+    return xla_softmax_job
+
+
+def make_trt_job(input_shape, axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_softmax_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.nn.softmax(x, axis=axis)
+
+    return trt_softmax_job
+
+
+class TestSoftmax(unittest.TestCase):
+    def _test_body(self, x, axis, dtype=np.float32):
+        f1 = make_job(x.shape, axis, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, axis, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, axis, dtype=flow.float32)
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, axis, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, axis, dtype=dtype)
+
+    def _test_random_body(self, shape, axis, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, axis, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((2, 5), axis=1)
+        self._test_ones_body((2, 5), axis=-1)
+        self._test_ones_body((1, 5, 2), axis=1)
+        self._test_ones_body((1, 5, 2), axis=2)
+
+    def test_random_input(self):
+        self._test_random_body((2, 5), axis=1)
+        self._test_random_body((2, 5), axis=-1)
+        self._test_random_body((1, 5, 2), axis=1)
+        self._test_random_body((1, 5, 2), axis=2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_softmax_grad.py b/python/oneflow/compatible/single_client/test/xrt/test_softmax_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..819679ef60af050e7e44f07d596bb6897630f14b
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_softmax_grad.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(shape, axis, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def softmax_grad_job(
+        y=flow.FixedTensorDef(shape, dtype=dtype),
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+    ):
+        return flow.nn.softmax_grad(y, dy, axis=axis)
+
+    return softmax_grad_job
+
+
+def make_xla_job(shape, axis, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_softmax_grad_job(
+        y=flow.FixedTensorDef(shape, dtype=dtype),
+        dy=flow.FixedTensorDef(shape, dtype=dtype),
+    ):
+        return flow.nn.softmax_grad(y, dy, axis=axis)
+
+    return xla_softmax_grad_job
+
+
+class TestSoftmaxGrad(unittest.TestCase):
+    def _test_body(self, y, dy, axis, dtype=np.float32):
+        f1 = make_job(y.shape, axis, dtype=flow.float32)
+        f2 = make_xla_job(y.shape, axis, dtype=flow.float32)
+        a = f1(y, dy).get()
+        b = f2(y, dy).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(a.shape == b.shape)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, axis, dtype=np.float32):
+        y = np.ones(shape, dtype=dtype)
+        dy = np.ones(shape, dtype=dtype)
+        self._test_body(y, dy, axis, dtype=dtype)
+
+    def _test_random_body(self, shape, axis, dtype=np.float32):
+        y = np.random.random(shape).astype(dtype)
+        dy = np.random.random(shape).astype(dtype)
+        self._test_body(y, dy, axis, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((2, 5), axis=1)
+        self._test_ones_body((2, 5), axis=-1)
+        self._test_ones_body((1, 5, 2), axis=1)
+        self._test_ones_body((1, 5, 2), axis=2)
+
+    def test_random_input(self):
+        self._test_random_body((2, 5), axis=1)
+        self._test_random_body((2, 5), axis=-1)
+        self._test_random_body((1, 5, 2), axis=1)
+        self._test_random_body((1, 5, 2), axis=2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_square_sum.py b/python/oneflow/compatible/single_client/test/xrt/test_square_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..45c2d3dd77620dfb07c4ebf1cd596ca36b38b7b1
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_square_sum.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(x_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def square_sum_job(x: flow.typing.Numpy.Placeholder(x_shape, dtype=dtype)):
+        return flow.experimental.square_sum(x)
+
+    return square_sum_job
+
+
+def make_xla_job(x_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_square_sum_job(x: flow.typing.Numpy.Placeholder(x_shape, dtype=dtype)):
+        return flow.experimental.square_sum(x)
+
+    return xla_square_sum_job
+
+
+class TestAdd(unittest.TestCase):
+    def _test_body(self, x, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, x_shape, dtype=np.float32):
+        x = np.ones(x_shape, dtype=dtype)
+        self._test_body(x, dtype=dtype)
+
+    def _test_random_body(self, x_shape, dtype=np.float32):
+        x = np.random.random(x_shape).astype(dtype)
+        self._test_body(x, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_tanh.py b/python/oneflow/compatible/single_client/test/xrt/test_tanh.py
new file mode 100644
index 0000000000000000000000000000000000000000..23bcc94443e19ee52cd842ed1179c02b7f0e6c27
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_tanh.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def tanh_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.tanh(x)
+
+    return tanh_job
+
+
+def make_xla_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_tanh_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.tanh(x)
+
+    return xla_tanh_job
+
+
+def make_trt_job(input_shape, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_tanh_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.math.tanh(x)
+
+    return trt_tanh_job
+
+
+class TestTanh(unittest.TestCase):
+    def _test_body(self, x, dtype=np.float32):
+        f1 = make_job(x.shape, dtype=flow.float32)
+        f2 = make_xla_job(x.shape, dtype=flow.float32)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, dtype=flow.float32)
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, dtype=np.float32):
+        x = np.ones(shape, dtype=dtype)
+        self._test_body(x, dtype=dtype)
+
+    def _test_random_body(self, shape, dtype=np.float32):
+        x = np.random.random(shape).astype(dtype)
+        self._test_body(x, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body(1)
+        self._test_ones_body((1, 10))
+        self._test_ones_body((2, 10, 2))
+        self._test_ones_body((2, 5, 2, 2))
+
+    def test_random_input(self):
+        self._test_random_body(1)
+        self._test_random_body((1, 10))
+        self._test_random_body((2, 10, 2))
+        self._test_random_body((2, 5, 2, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/test/xrt/test_transpose.py b/python/oneflow/compatible/single_client/test/xrt/test_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f46de633ce2440a0fa28d709654596b7d30b51c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/test/xrt/test_transpose.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow.compatible.single_client.unittest
+from oneflow.compatible import single_client as flow
+
+config = flow.function_config()
+
+
+def make_job(input_shape, permute, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def transpose_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.transpose(x, perm=permute)
+
+    return transpose_job
+
+
+def make_xla_job(input_shape, permute, dtype=flow.float32):
+    config.use_xla_jit(True)
+    config.use_tensorrt(False)
+
+    @flow.global_function(config)
+    def xla_transpose_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.transpose(x, perm=permute)
+
+    return xla_transpose_job
+
+
+def make_trt_job(input_shape, permute, dtype=flow.float32):
+    config.use_xla_jit(False)
+    config.use_tensorrt(True)
+
+    @flow.global_function(config)
+    def trt_transpose_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
+        return flow.transpose(x, perm=permute)
+
+    return trt_transpose_job
+
+
+class TestTranspose(unittest.TestCase):
+    def _test_body(self, x, permute, dtype=flow.float32):
+        f1 = make_job(x.shape, permute, dtype=dtype)
+        f2 = make_xla_job(x.shape, permute, dtype=dtype)
+        a = f1(x).get()
+        b = f2(x).get()
+        print("without xla: ", a)
+        print("with xla: ", b)
+        self.assertTrue(a.shape == b.shape)
+        self.assertTrue(np.allclose(a.numpy(), b.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+        f3 = make_trt_job(x.shape, permute, dtype=dtype)
+        c = f3(x).get()
+        print("with tensorrt: ", c)
+        self.assertTrue(a.shape == c.shape)
+        self.assertTrue(np.allclose(a.numpy(), c.numpy(), rtol=0.001, atol=1e-05))
+        flow.clear_default_session()
+
+    def _test_ones_body(self, shape, permute, dtype=flow.float32):
+        np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+        x = np.ones(shape, dtype=np_dtype)
+        self._test_body(x, permute, dtype=dtype)
+
+    def _test_random_body(self, shape, permute, dtype=flow.float32):
+        np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+        x = np.random.random(shape).astype(np_dtype)
+        self._test_body(x, permute, dtype=dtype)
+
+    def test_ones_input(self):
+        self._test_ones_body((1, 2), (1, 0))
+        self._test_ones_body((2, 2, 2), (0, 2, 1))
+        self._test_ones_body((2, 2, 2), (1, 0, 2))
+        self._test_ones_body((2, 2, 2), (1, 2, 0))
+
+    def test_random_input(self):
+        self._test_random_body((1, 2), (1, 0))
+        self._test_random_body((2, 2, 2), (0, 2, 1))
+        self._test_random_body((2, 2, 2), (1, 0, 2))
+        self._test_random_body((2, 2, 2), (1, 2, 0))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/compatible/single_client/train.py b/python/oneflow/compatible/single_client/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..510eb38378bbf5311a4b1e48c60c878bd9674927
--- /dev/null
+++ b/python/oneflow/compatible/single_client/train.py
@@ -0,0 +1,19 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.check_point import (
+    CheckPoint,
+    SimpleCheckPointManager,
+)
diff --git a/python/oneflow/compatible/single_client/typing.py b/python/oneflow/compatible/single_client/typing.py
new file mode 100644
index 0000000000000000000000000000000000000000..47785180a683e2cf387e95721f318ba9677f945c
--- /dev/null
+++ b/python/oneflow/compatible/single_client/typing.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.typing import (
+    Bundle,
+    Callback,
+    ListNumpy,
+    Numpy,
+)
diff --git a/python/oneflow/compatible/single_client/unittest/__init__.py b/python/oneflow/compatible/single_client/unittest/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c3b09cd1be045ceff75c8e2d57e18eea020fc59
--- /dev/null
+++ b/python/oneflow/compatible/single_client/unittest/__init__.py
@@ -0,0 +1,28 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.unittest import (
+    TestCase,
+    num_nodes_required,
+    register_test_cases,
+    skip_unless_1n1d,
+    skip_unless_1n2d,
+    skip_unless_1n4d,
+    skip_unless_2n1d,
+    skip_unless_2n2d,
+    skip_unless_2n4d,
+)
+
+from . import env
diff --git a/python/oneflow/compatible/single_client/unittest/env.py b/python/oneflow/compatible/single_client/unittest/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2d94835df9490d10f1d69c9fa22c2cc292125f7
--- /dev/null
+++ b/python/oneflow/compatible/single_client/unittest/env.py
@@ -0,0 +1,25 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.unittest import (
+    device_num,
+    eager_execution_enabled,
+    has_node_list,
+    has_world_size,
+    node_list,
+    node_size,
+    typing_check_enabled,
+    world_size,
+)
diff --git a/python/oneflow/compatible/single_client/util.py b/python/oneflow/compatible/single_client/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a664d82d84d4f79ca9552345d932849047343fb
--- /dev/null
+++ b/python/oneflow/compatible/single_client/util.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.compatible.single_client.framework.id_util import UniqueStr as unique_str
diff --git a/python/oneflow/compatible/single_client/vm.py b/python/oneflow/compatible/single_client/vm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/config/__init__.py b/python/oneflow/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0701886826282b44a7dd07c3a6903493efdcb469
--- /dev/null
+++ b/python/oneflow/config/__init__.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.config_util import api_collect_act_event as collect_act_event
+from oneflow.framework.config_util import api_comm_net_worker_num as comm_net_worker_num
+from oneflow.framework.config_util import (
+    api_compute_thread_pool_size as compute_thread_pool_size,
+)
+from oneflow.framework.config_util import api_cpu_device_num as cpu_device_num
+from oneflow.framework.config_util import (
+    api_disable_group_boxing_by_dst_parallel as disable_group_boxing_by_dst_parallel,
+)
+from oneflow.framework.config_util import api_enable_debug_mode as enable_debug_mode
+from oneflow.framework.config_util import (
+    api_enable_legacy_model_io as enable_legacy_model_io,
+)
+from oneflow.framework.config_util import (
+    api_enable_mem_chain_merge as enable_mem_chain_merge,
+)
+from oneflow.framework.config_util import api_enable_model_io_v2 as enable_model_io_v2
+from oneflow.framework.config_util import (
+    api_enable_tensor_float_32_compute as enable_tensor_float_32_compute,
+)
+from oneflow.framework.config_util import api_gpu_device_num as gpu_device_num
+from oneflow.framework.config_util import (
+    api_legacy_model_io_enabled as legacy_model_io_enabled,
+)
+from oneflow.framework.config_util import api_load_library as load_library
+from oneflow.framework.config_util import api_load_library_now as load_library_now
+from oneflow.framework.config_util import api_machine_num as machine_num
+from oneflow.framework.config_util import (
+    api_max_mdsave_worker_num as max_mdsave_worker_num,
+)
+from oneflow.framework.config_util import (
+    api_nccl_use_compute_stream as nccl_use_compute_stream,
+)
+from oneflow.framework.config_util import (
+    api_numa_aware_cuda_malloc_host as enable_numa_aware_cuda_malloc_host,
+)
+from oneflow.framework.config_util import (
+    api_rdma_mem_block_mbyte as rdma_mem_block_mbyte,
+)
+from oneflow.framework.config_util import (
+    api_rdma_recv_msg_buf_mbyte as rdma_recv_msg_buf_mbyte,
+)
+from oneflow.framework.config_util import (
+    api_reserved_device_mem_mbyte as reserved_device_mem_mbyte,
+)
+from oneflow.framework.config_util import (
+    api_reserved_host_mem_mbyte as reserved_host_mem_mbyte,
+)
+from oneflow.framework.config_util import (
+    api_thread_enable_local_message_queue as thread_enable_local_message_queue,
+)
+from oneflow.framework.config_util import api_use_rdma as use_rdma
diff --git a/python/oneflow/config/collective_boxing.py b/python/oneflow/config/collective_boxing.py
new file mode 100644
index 0000000000000000000000000000000000000000..52c78eca5160e8d45b9b24a9b74523de1cd69997
--- /dev/null
+++ b/python/oneflow/config/collective_boxing.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.config_util import api_enable_fusion as enable_fusion
+from oneflow.framework.config_util import (
+    api_nccl_enable_all_to_all as nccl_enable_all_to_all,
+)
+from oneflow.framework.config_util import (
+    api_nccl_enable_mixed_fusion as nccl_enable_mixed_fusion,
+)
+from oneflow.framework.config_util import (
+    api_nccl_fusion_all_gather as nccl_fusion_all_gather,
+)
+from oneflow.framework.config_util import (
+    api_nccl_fusion_all_reduce as nccl_fusion_all_reduce,
+)
+from oneflow.framework.config_util import (
+    api_nccl_fusion_all_reduce_use_buffer as nccl_fusion_all_reduce_use_buffer,
+)
+from oneflow.framework.config_util import (
+    api_nccl_fusion_broadcast as nccl_fusion_broadcast,
+)
+from oneflow.framework.config_util import api_nccl_fusion_max_ops as nccl_fusion_max_ops
+from oneflow.framework.config_util import api_nccl_fusion_reduce as nccl_fusion_reduce
+from oneflow.framework.config_util import (
+    api_nccl_fusion_reduce_scatter as nccl_fusion_reduce_scatter,
+)
+from oneflow.framework.config_util import (
+    api_nccl_fusion_threshold_mb as nccl_fusion_threshold_mb,
+)
+from oneflow.framework.config_util import api_nccl_num_streams as nccl_num_streams
+from oneflow.framework.config_util import (
+    api_num_callback_threads as num_callback_threads,
+)
diff --git a/python/oneflow/contrib/__init__.py b/python/oneflow/contrib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb5daf782a0d43a4e7d17d9e2462194ac5658caa
--- /dev/null
+++ b/python/oneflow/contrib/__init__.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .tensorrt import *
diff --git a/python/oneflow/contrib/tensorrt/__init__.py b/python/oneflow/contrib/tensorrt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/contrib/tensorrt/tensorrt_api.py b/python/oneflow/contrib/tensorrt/tensorrt_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..6add6042f7859b6e133879fb846eada5a08bc573
--- /dev/null
+++ b/python/oneflow/contrib/tensorrt/tensorrt_api.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import oneflow._oneflow_internal
+
+
+def write_int8_calibration(path):
+    try:
+        oneflow._oneflow_internal.WriteInt8Calibration(path)
+    except oneflow._oneflow_internal.exception.CompileOptionWrongException:
+        traceback.print_exc()
+
+
+def cache_int8_calibration():
+    try:
+        oneflow._oneflow_internal.CacheInt8Calibration()
+    except oneflow._oneflow_internal.exception.CompileOptionWrongException:
+        traceback.print_exc()
diff --git a/python/oneflow/data.py b/python/oneflow/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..1767f9f5d2bcff7059c7523ebffcf1b88713a2fc
--- /dev/null
+++ b/python/oneflow/data.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.experimental.load_mnist import load_mnist
diff --git a/python/oneflow/deprecated/__init__.py b/python/oneflow/deprecated/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..105967f29d1b20a685b285c4cbe337398d9ac896
--- /dev/null
+++ b/python/oneflow/deprecated/__init__.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.deprecated.init_cluster_env import (
+    delete_worker_by_bootstrap,
+    delete_worker_of_multi_process,
+)
+from oneflow.experimental.namescope import deprecated_name_scope as variable_scope
diff --git a/python/oneflow/deprecated/init_cluster_env.py b/python/oneflow/deprecated/init_cluster_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..5975ec24714701634e9f2673e1861fdd19196dba
--- /dev/null
+++ b/python/oneflow/deprecated/init_cluster_env.py
@@ -0,0 +1,63 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import getpass
+import os
+import subprocess
+import sys
+import uuid
+from tempfile import NamedTemporaryFile
+
+import google.protobuf.text_format as pbtxt
+
+import oneflow.framework.env_util as env_util
+from oneflow.core.control.ctrl_bootstrap_pb2 import BootstrapConf
+from oneflow.core.job.env_pb2 import EnvProto
+
+
+def delete_worker_by_bootstrap(ssh_port=22) -> None:
+    ssh_port_arg = " -p {} ".format(ssh_port)
+    bootstrap_conf_list = env_util.global_ctrl_bootstrap_confs
+    assert isinstance(bootstrap_conf_list, list)
+    global _temp_run_dir
+    assert _temp_run_dir != ""
+    for bootstrap_conf in bootstrap_conf_list:
+        assert isinstance(bootstrap_conf, BootstrapConf)
+        if bootstrap_conf.rank == 0:
+            continue
+        ssh_prefix = (
+            "ssh {} ".format(ssh_port_arg)
+            + getpass.getuser()
+            + "@"
+            + bootstrap_conf.host
+            + " "
+        )
+        if os.getenv("ONEFLOW_WORKER_KEEP_LOG"):
+            print("worker log kept at: {}".format(bootstrap_conf.host), flush=True)
+        else:
+            _SystemCall(ssh_prefix + '"rm -r ' + _temp_run_dir + '"')
+            print("temp run dir removed at: {}".format(bootstrap_conf.host), flush=True)
+
+
+def delete_worker_of_multi_process(run_dir) -> None:
+    assert run_dir != ""
+    if os.getenv("ONEFLOW_WORKER_KEEP_LOG"):
+        print("worker log kept at localhost:" + run_dir, flush=True)
+    else:
+        os.system("rm -r " + run_dir)
+        print("temp run dir removed at localhost:" + run_dir, flush=True)
+
+
+_temp_run_dir = ""
diff --git a/python/oneflow/deprecated/initializer_util.py b/python/oneflow/deprecated/initializer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e5d01bfe5d31bfb6004a3eec9e1c3f6377bd673
--- /dev/null
+++ b/python/oneflow/deprecated/initializer_util.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow.core.common.data_type_pb2 as data_type_conf_util
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+def truncated_normal_initializer(
+    stddev: float = 1.0,
+) -> initializer_conf_util.InitializerConf:
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.truncated_normal_conf, "std", float(stddev))
+    return initializer
diff --git a/python/oneflow/deprecated/nn.py b/python/oneflow/deprecated/nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d4232d220966581375c950577b08471bded51c7
--- /dev/null
+++ b/python/oneflow/deprecated/nn.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.module import Module
diff --git a/python/oneflow/distribute.py b/python/oneflow/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..9964e7399adc5a1f45f01904797e8a63e76d0d80
--- /dev/null
+++ b/python/oneflow/distribute.py
@@ -0,0 +1,29 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.distribute import assert_is_valid_distribute, auto, broadcast
+from oneflow.framework.distribute import (
+    deprecated_consistent_strategy as consistent_strategy,
+)
+from oneflow.framework.distribute import (
+    deprecated_consistent_strategy_enabled as consistent_strategy_enabled,
+)
+from oneflow.framework.distribute import (
+    deprecated_mirrored_strategy as mirrored_strategy,
+)
+from oneflow.framework.distribute import (
+    deprecated_mirrored_strategy_enabled as mirrored_strategy_enabled,
+)
+from oneflow.framework.distribute import split
diff --git a/python/oneflow/distributed/__init__.py b/python/oneflow/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6064c7a752b48fa08b9c4ccae105ddb56448e1c2
--- /dev/null
+++ b/python/oneflow/distributed/__init__.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.distribute import (
+    get_local_rank,
+    get_rank,
+    get_world_size,
+    is_multi_client,
+)
diff --git a/python/oneflow/distributed/launch.py b/python/oneflow/distributed/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..078689e6c167acbc385ee45197e609fba3940ceb
--- /dev/null
+++ b/python/oneflow/distributed/launch.py
@@ -0,0 +1,211 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+"""
+This file is mostly copied from PyTorch v1.8.1 torch/distributed/launch.py
+"""
+import os
+import signal
+import subprocess
+import sys
+import time
+from argparse import REMAINDER, ArgumentParser
+from typing import IO, Any, List, Optional
+
+stdout_filename = "stdout"
+stderr_filename = "stderr"
+
+
+def parse_args():
+    """
+    Helper function parsing the command line options
+    @retval ArgumentParser
+    """
+    parser = ArgumentParser(
+        description="PyTorch distributed training launch helper utility that will spawn up multiple distributed processes"
+    )
+    parser.add_argument(
+        "--nnodes",
+        type=int,
+        default=1,
+        help="The number of nodes to use for distributed training",
+    )
+    parser.add_argument(
+        "--node_rank",
+        type=int,
+        default=0,
+        help="The rank of the node for multi-node distributed training",
+    )
+    parser.add_argument(
+        "--nproc_per_node",
+        type=int,
+        default=1,
+        help="The number of processes to launch on each node, for GPU training, this is recommended to be set to the number of GPUs in your system so that each process can be bound to a single GPU.",
+    )
+    parser.add_argument(
+        "--master_addr",
+        default="127.0.0.1",
+        type=str,
+        help="Master node (rank 0)'s address, should be either the IP address or the hostname of node 0, for single node multi-proc training, the --master_addr can simply be 127.0.0.1",
+    )
+    parser.add_argument(
+        "--master_port",
+        default=29500,
+        type=int,
+        help="Master node (rank 0)'s free port that needs to be used for communication during distributed training",
+    )
+    parser.add_argument(
+        "-m",
+        "--module",
+        default=False,
+        action="store_true",
+        help="Changes each process to interpret the launch script as a python module, executing with the same behavior as'python -m'.",
+    )
+    parser.add_argument(
+        "--no_python",
+        default=False,
+        action="store_true",
+        help='Do not prepend the training script with "python" - just exec it directly. Useful when the script is not a Python script.',
+    )
+    parser.add_argument(
+        "--redirect_stdout_and_stderr",
+        default=False,
+        action="store_true",
+        help=f"write the stdout and stderr to files\n                    '{stdout_filename}' and '{stderr_filename}'. Only available when logdir is set",
+    )
+    parser.add_argument(
+        "--logdir",
+        default=None,
+        type=str,
+        help=f"Relative path to write subprocess logs to. Passing in a relative\n        path will create a directory if needed. Note that\n        successive runs with the same path to write logs to will overwrite existing logs,\n        so be sure to save logs as needed.",
+    )
+    parser.add_argument(
+        "training_script",
+        type=str,
+        help="The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script",
+    )
+    parser.add_argument("training_script_args", nargs=REMAINDER)
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    dist_world_size = args.nproc_per_node * args.nnodes
+    current_env = os.environ.copy()
+    current_env["MASTER_ADDR"] = args.master_addr
+    current_env["MASTER_PORT"] = str(args.master_port)
+    current_env["WORLD_SIZE"] = str(dist_world_size)
+    processes: List[Any] = []
+    if args.logdir:
+        if os.path.exists(args.logdir):
+            if not os.path.isdir(args.logdir):
+                raise ValueError("argument --logdir must be a path to a directory.")
+        else:
+            os.mkdir(os.path.join(os.getcwd(), args.logdir))
+    subprocess_file_handles = []
+    for local_rank in range(0, args.nproc_per_node):
+        dist_rank = args.nproc_per_node * args.node_rank + local_rank
+        current_env["RANK"] = str(dist_rank)
+        current_env["LOCAL_RANK"] = str(local_rank)
+        with_python = not args.no_python
+        cmd = []
+        if with_python:
+            cmd = [sys.executable, "-u"]
+            if args.module:
+                cmd.append("-m")
+        elif args.module:
+            raise ValueError(
+                "Don't use both the '--no_python' flag and the '--module' flag at the same time."
+            )
+        cmd.append(args.training_script)
+        cmd.extend(args.training_script_args)
+        stdout_handle: Optional[IO]
+        stderr_handle: Optional[IO]
+        if args.logdir:
+            directory_path = os.path.join(
+                os.getcwd(), args.logdir, f"local_rank_{local_rank}"
+            )
+            os.makedirs(directory_path, exist_ok=True)
+            current_env["GLOG_log_dir"] = directory_path
+        if args.redirect_stdout_and_stderr:
+            if not args.logdir:
+                raise ValueError(
+                    "'redirect_stdout_and_stderr' is only available when 'logdir' is set."
+                )
+            node_rank = args.node_rank
+            stdout_handle = open(os.path.join(directory_path, stdout_filename), "w")
+            stderr_handle = open(os.path.join(directory_path, stderr_filename), "w")
+            subprocess_file_handles.append((stdout_handle, stderr_handle))
+            stdout_name = stdout_handle.name
+            stderr_name = stderr_handle.name
+            print(
+                f"Note: Stdout and stderr for node {node_rank} rank {local_rank} will\n            be written to {stdout_name}, {stderr_name} respectively."
+            )
+        sig_names = {2: "SIGINT", 15: "SIGTERM"}
+        last_return_code = None
+
+        def sigkill_handler(signum, frame):
+            for process in processes:
+                print(f"Killing subprocess {process.pid}")
+                try:
+                    process.kill()
+                except Exception:
+                    pass
+            if last_return_code is not None:
+                raise subprocess.CalledProcessError(
+                    returncode=last_return_code, cmd=cmd
+                )
+            if signum in sig_names:
+                print(f"Main process received {sig_names[signum]}, exiting")
+            sys.exit(1)
+
+        signal.signal(signal.SIGINT, sigkill_handler)
+        signal.signal(signal.SIGTERM, sigkill_handler)
+        stdout_handle = (
+            None
+            if not subprocess_file_handles
+            else subprocess_file_handles[local_rank][0]
+        )
+        stderr_handle = (
+            None
+            if not subprocess_file_handles
+            else subprocess_file_handles[local_rank][1]
+        )
+        process = subprocess.Popen(
+            cmd, env=current_env, stdout=stdout_handle, stderr=stderr_handle
+        )
+        processes.append(process)
+    try:
+        alive_processes = set(processes)
+        while len(alive_processes):
+            finished_processes = []
+            for process in alive_processes:
+                if process.poll() is None:
+                    continue
+                elif process.returncode != 0:
+                    last_return_code = process.returncode
+                    sigkill_handler(signal.SIGTERM, None)
+                else:
+                    finished_processes.append(process)
+            alive_processes = set(alive_processes) - set(finished_processes)
+            time.sleep(1)
+    finally:
+        for (stdout_handle, stderr_handle) in subprocess_file_handles:
+            stdout_handle.close()
+            stderr_handle.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/oneflow/eager/__init__.py b/python/oneflow/eager/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/eager/blob_register.py b/python/oneflow/eager/blob_register.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bc839f945f19e9a57e36c9668ae6940a687b5e8
--- /dev/null
+++ b/python/oneflow/eager/blob_register.py
@@ -0,0 +1,35 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+import oneflow
+import oneflow._oneflow_internal
+
+
+@contextmanager
+def BnInOp2BlobObjectScope(blob_register, op_attribute):
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    for ibn in op_attribute.input_bns:
+        lbi = op_attribute.arg_signature.bn_in_op2lbi[ibn]
+        bn_in_op2blob_object[ibn] = blob_register.GetObject4BlobName(
+            "%s/%s" % (lbi.op_name, lbi.blob_name)
+        )
+    yield bn_in_op2blob_object
+    for obn in op_attribute.output_bns:
+        lbi = op_attribute.arg_signature.bn_in_op2lbi[obn]
+        blob_register.SetObject4BlobName(
+            "%s/%s" % (lbi.op_name, lbi.blob_name), bn_in_op2blob_object[obn]
+        )
diff --git a/python/oneflow/eager/boxing_hob.py b/python/oneflow/eager/boxing_hob.py
new file mode 100644
index 0000000000000000000000000000000000000000..31bc0902cf994e5d05444f44f29ae0c674595f2e
--- /dev/null
+++ b/python/oneflow/eager/boxing_hob.py
@@ -0,0 +1,163 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.support.high_order_bool import BoolFunctor, bool_functor, hob_context_attr
+
+
+class BoxingHobContext(object):
+    def __init__(self, produced_blob_object, consumer_op_arg_parallel_attr):
+        self.produced_blob_object_ = produced_blob_object
+        self.consumer_op_arg_parallel_attr_ = consumer_op_arg_parallel_attr
+        self.composer2lhs_context = {}
+        self.composer2rhs_context = {}
+        self.composer2middle_op_arg_parallel_attr = {}
+
+    @property
+    def produced_blob_object(self):
+        return self.produced_blob_object_
+
+    @property
+    def consumer_op_arg_parallel_attr(self):
+        return self.consumer_op_arg_parallel_attr_
+
+
+class ComposeHob(BoolFunctor):
+    def __init__(
+        self, lhs_hob, rhs_hob, get_middle_op_arg_parallel_attr, middle_verbose_str=None
+    ):
+        self.get_middle_op_arg_parallel_attr_ = get_middle_op_arg_parallel_attr
+        self.lhs_hob_ = lhs_hob
+        self.rhs_hob_ = rhs_hob
+        self.ctx_id2middle_op_arg_parallel_attr_ = {}
+        self.middle_verbose_str_ = middle_verbose_str
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        left_display = self.lhs_hob_.debug_str(self._GetLhsContext(ctx), display_result)
+        display_result = display_result and self.lhs_hob_(self._GetLhsContext(ctx))
+        right_display = self.rhs_hob_.debug_str(
+            self._GetRhsContext(ctx), display_result
+        )
+        return "%s -> %s" % (left_display, right_display)
+
+    def __call__(self, ctx):
+        return self.lhs_hob_(self._GetLhsContext(ctx)) and self.rhs_hob_(
+            self._GetRhsContext(ctx)
+        )
+
+    def _GetLhsContext(self, ctx):
+        if self not in ctx.composer2lhs_context:
+            blob_object = oneflow._oneflow_internal.BlobObject(
+                ctx.produced_blob_object.object_id,
+                ctx.produced_blob_object.op_arg_parallel_attr,
+                ctx.produced_blob_object.op_arg_blob_attr,
+            )
+            value = BoxingHobContext(blob_object, self._GetMiddleOpArgParallelAttr(ctx))
+            ctx.composer2lhs_context[self] = value
+        return ctx.composer2lhs_context[self]
+
+    def _GetRhsContext(self, ctx):
+        if self not in ctx.composer2rhs_context:
+            middle_blob_object = oneflow._oneflow_internal.BlobObject(
+                ctx.produced_blob_object.object_id,
+                self._GetMiddleOpArgParallelAttr(ctx),
+                ctx.produced_blob_object.op_arg_blob_attr,
+            )
+            value = BoxingHobContext(
+                middle_blob_object, ctx.consumer_op_arg_parallel_attr
+            )
+            ctx.composer2rhs_context[self] = value
+        return ctx.composer2rhs_context[self]
+
+    def _GetMiddleOpArgParallelAttr(self, ctx):
+        if self not in ctx.composer2middle_op_arg_parallel_attr:
+            value = self.get_middle_op_arg_parallel_attr_(
+                None, ctx.produced_blob_object, ctx.consumer_op_arg_parallel_attr
+            )
+            if self.middle_verbose_str_ is not None:
+                print("=== %s ===" % self.middle_verbose_str_)
+                print(value)
+            ctx.composer2middle_op_arg_parallel_attr[self] = value
+        return ctx.composer2middle_op_arg_parallel_attr[self]
+
+
+@bool_functor("SingleMachine")
+def SingleMachine(ctx):
+    blob_device_ids = dict(
+        ctx.produced_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    )
+    arg_parallel_desc_symbol = ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
+    op_arg_device_ids = dict(arg_parallel_desc_symbol.machine_id2device_id_list)
+    return list(blob_device_ids.keys()) == [0] and list(op_arg_device_ids.keys()) == [0]
+
+
+@bool_functor("MatchDeviceOneToOnePerMachine")
+def MatchDeviceOneToOnePerMachine(ctx):
+    blob_device_ids = dict(
+        ctx.produced_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    )
+    arg_parallel_desc_symbol = ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
+    op_arg_device_ids = dict(arg_parallel_desc_symbol.machine_id2device_id_list)
+    if blob_device_ids.keys() != op_arg_device_ids.keys():
+        return False
+    for key in blob_device_ids.keys():
+        if len(blob_device_ids[key]) != len(op_arg_device_ids[key]):
+            return False
+    return True
+
+
+@bool_functor("Verbose")
+def Verbose(ctx):
+    print("============[producer]============")
+    print(ctx.produced_blob_object.op_arg_parallel_attr.parallel_desc_symbol)
+    print(ctx.produced_blob_object.op_arg_parallel_attr.sbp_parallel)
+    print("============[consumer]============")
+    print(ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol)
+    print(ctx.consumer_op_arg_parallel_attr.sbp_parallel)
+    return True
+
+
+@bool_functor("producer's devices contained in consumer's devices")
+def ProducerDevicesContainedInConsumerDevices(ctx):
+    return ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol.Containing(
+        ctx.produced_blob_object.parallel_desc_symbol
+    )
+
+
+@bool_functor("consumer's devices contained in producer's devices")
+def ConsumerDevicesContainedInProducerDevices(ctx):
+    return ctx.produced_blob_object.parallel_desc_symbol.Containing(
+        ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
+    )
+
+
+@hob_context_attr("consumer_sbp_parallel")
+def consumer_sbp_parallel(ctx):
+    return ctx.consumer_op_arg_parallel_attr.sbp_parallel
+
+
+@hob_context_attr("producer_sbp_parallel")
+def producer_sbp_parallel(ctx):
+    return ctx.produced_blob_object.op_arg_parallel_attr.sbp_parallel
+
+
+@hob_context_attr("producer_parallel_desc")
+def producer_parallel_desc(ctx):
+    return ctx.produced_blob_object.op_arg_parallel_attr.parallel_desc_symbol
+
+
+@hob_context_attr("consumer_parallel_desc")
+def consumer_parallel_desc(ctx):
+    return ctx.consumer_op_arg_parallel_attr.parallel_desc_symbol
diff --git a/python/oneflow/eager/boxing_middle.py b/python/oneflow/eager/boxing_middle.py
new file mode 100644
index 0000000000000000000000000000000000000000..0108722429139717658f6682bad0ecbafd18159a
--- /dev/null
+++ b/python/oneflow/eager/boxing_middle.py
@@ -0,0 +1,173 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.common.shape as shape_proto_cfg
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow.core.job.sbp_parallel_pb2 as sbp_parallel_pb
+import oneflow.eager.symbol as symbol_util
+
+
+class BoxingToMiddle(object):
+    def __init__(
+        self,
+        boxing_method,
+        get_middle_parallel_desc_symbol,
+        get_middle_sbp_parallel,
+        verbose=False,
+    ):
+        self.boxing_method_ = boxing_method
+        self.get_middle_op_arg_parallel_attr_ = MiddleOpArgParallelAttr(
+            get_middle_parallel_desc_symbol, get_middle_sbp_parallel
+        )
+        self.verbose_ = verbose
+
+    @property
+    def boxing_method(self):
+        return self.boxing_method_
+
+    @property
+    def get_middle_op_arg_parallel_attr(self):
+        return self.get_middle_op_arg_parallel_attr_
+
+    @property
+    def verbose(self):
+        return self.verbose_
+
+
+def MiddleOpArgParallelAttr(get_parallel_desc_symbol, get_sbp_parallel):
+    def GetOpArgParallelAttr(
+        builder, produced_blob_object, consumer_op_arg_parallel_attr
+    ):
+        return oneflow._oneflow_internal.OpArgParallelAttribute(
+            get_parallel_desc_symbol(
+                builder, produced_blob_object, consumer_op_arg_parallel_attr
+            ),
+            str(
+                get_sbp_parallel(
+                    builder, produced_blob_object, consumer_op_arg_parallel_attr
+                )
+            ),
+            str(produced_blob_object.op_arg_parallel_attr.opt_mirrored_parallel),
+        )
+
+    return GetOpArgParallelAttr
+
+
+def ReplaceProducerDeviceTag(new_device_tag):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        x_parallel_attr = produced_blob_object.op_arg_parallel_attr
+        return TryReplaceDeviceTag(
+            builder, x_parallel_attr.parallel_desc_symbol, new_device_tag
+        )
+
+    return Getter
+
+
+def ProducerRandomParallelIdPerMachine(device_tag=None):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        return RandomParallelIdPerMachine(
+            produced_blob_object.parallel_desc_symbol,
+            device_tag=device_tag,
+            builder=builder,
+        )
+
+    return Getter
+
+
+def ConsumerRandomParallelIdPerMachine(device_tag=None):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        return RandomParallelIdPerMachine(
+            consumer_op_arg_parallel_attr.parallel_desc_symbol,
+            device_tag=device_tag,
+            builder=builder,
+        )
+
+    return Getter
+
+
+def ProducerParallelDesc(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object.parallel_desc_symbol
+
+
+def ConsumerParallelDesc(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return consumer_op_arg_parallel_attr.parallel_desc_symbol
+
+
+def ReplaceConsumerDeviceTag(new_device_tag):
+    def Getter(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        parallel_desc_sym = consumer_op_arg_parallel_attr.parallel_desc_symbol
+        return TryReplaceDeviceTag(builder, parallel_desc_sym, new_device_tag)
+
+    return Getter
+
+
+def BroadcastParallel(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    sbp_parallel = sbp_parallel_pb.SbpParallel()
+    sbp_parallel.broadcast_parallel.SetInParent()
+    return sbp_parallel
+
+
+def ProducerSbpParallel(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object.op_arg_parallel_attr.sbp_parallel
+
+
+def ConsumerSbpParallel(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return consumer_op_arg_parallel_attr.sbp_parallel
+
+
+def TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag):
+    if parallel_desc_symbol.device_tag == device_tag:
+        return parallel_desc_symbol
+    else:
+        return ReplaceDeviceTag(parallel_desc_symbol, device_tag, builder=builder)
+
+
+def ReplaceDeviceTag(parallel_desc_symbol, device_tag, builder=None):
+    assert parallel_desc_symbol.device_tag != device_tag
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag(device_tag)
+    for device_name in parallel_desc_symbol.parallel_conf.device_name():
+        parallel_conf.add_device_name(device_name)
+    hierarchy = shape_proto_cfg.ShapeProto()
+    for dim in parallel_desc_symbol.hierarchy:
+        hierarchy.add_dim(dim)
+    assert hierarchy.dim_size() > 0
+    parallel_conf.mutable_hierarchy().CopyFrom(hierarchy)
+    if builder is None:
+        return oneflow._oneflow_internal.PlacementSymbol(
+            parallel_desc_symbol.symbol_id, parallel_conf
+        )
+    else:
+        return builder.GetParallelDescSymbol(parallel_conf)
+
+
+def RandomParallelIdPerMachine(parallel_desc_symbol, device_tag=None, builder=None):
+    if device_tag is None:
+        device_tag = parallel_desc_symbol.parallel_conf.device_tag()
+    assert device_tag is not None
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag(device_tag)
+    for (machine_id, dev_ids) in parallel_desc_symbol.machine_id2device_id_list.items():
+        dev_id = dev_ids[random.randint(0, len(dev_ids) - 1)]
+        parallel_conf.add_device_name("@%s:%s" % (machine_id, dev_id))
+    if builder is None:
+        return oneflow._oneflow_internal.PlacementSymbol(
+            parallel_desc_symbol.symbol_id, parallel_conf
+        )
+    else:
+        return builder.GetParallelDescSymbol(parallel_conf)
diff --git a/python/oneflow/eager/boxing_util.py b/python/oneflow/eager/boxing_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c2cb3dafdc149a908934a887ae85356b2cb1ca
--- /dev/null
+++ b/python/oneflow/eager/boxing_util.py
@@ -0,0 +1,981 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+from contextlib import contextmanager
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.common.shape as shape_proto_cfg
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow.core.job.sbp_parallel_pb2 as sbp_parallel_pb
+import oneflow.core.operator.op_attribute_pb2 as op_attribute_pb
+import oneflow.core.operator.op_conf_pb2 as op_conf_pb
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.boxing_hob as boxing_hob
+import oneflow.eager.boxing_middle as boxing_middle
+import oneflow.eager.op_infer_util as op_infer_util
+import oneflow.eager.symbol as symbol_util
+import oneflow.framework.balanced_splitter as balanced_splitter
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.id_util as id_util
+import oneflow.support.enable_if as enable_if
+import oneflow.support.high_order_bool as high_order_bool
+from oneflow.eager.boxing_hob import BoxingHobContext
+
+
+def BoxingTo(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    hob_context = BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr)
+    if enable_if.get_condition_hob(NoBoxing)(hob_context):
+        return produced_blob_object
+    producer_opt_mirrored_parallel = (
+        produced_blob_object.op_arg_parallel_attr.opt_mirrored_parallel
+    )
+    consumer_opt_mirrored_parallel = consumer_op_arg_parallel_attr.opt_mirrored_parallel
+    assert producer_opt_mirrored_parallel == consumer_opt_mirrored_parallel, (
+        "\nproducer_op_arg_parallel_attr: %s\nconsumer_op_arg_parallel_attr: %s"
+        % (produced_blob_object.op_arg_parallel_attr, consumer_op_arg_parallel_attr)
+    )
+
+    def default(get_failed_info, *args, **kwargs):
+        raise NotImplementedError(
+            "%s\nno boxing method found.\nlogical_blob_name: %s\nx_arg_attribute: %s\nconsumer_op_arg_parallel_attr: %s\n"
+            % (
+                get_failed_info(),
+                produced_blob_object.op_arg_blob_attr.logical_blob_name,
+                produced_blob_object.op_arg_parallel_attr,
+                consumer_op_arg_parallel_attr,
+            )
+        )
+
+    global conditional_function_table
+    function = enable_if.unique(
+        conditional_function_table,
+        context=BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr),
+        default=default,
+    )
+    return function(builder, produced_blob_object, consumer_op_arg_parallel_attr)
+
+
+def boxing_condition(hob_expr, verbose=False):
+    def Decorator(func):
+        func.__oneflow_condition_hob__ = hob_expr
+        if not verbose:
+            hob_expr.__debug_str__ = GetBoxingDebugString(func)
+        return func
+
+    return Decorator
+
+
+def FirstMatchedBoxing(*boxing_methods):
+    hob_expr = enable_if.get_condition_hob(boxing_methods[0])
+    for boxing_method in boxing_methods[1:]:
+        hob_expr = hob_expr | enable_if.get_condition_hob(boxing_method)
+
+    @enable_if.condition(hob_expr)
+    def FirstMatched(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        ctx = BoxingHobContext(produced_blob_object, consumer_op_arg_parallel_attr)
+        for boxing_method in boxing_methods:
+            hob_expr = enable_if.get_condition_hob(boxing_method)
+            if not hob_expr(ctx):
+                continue
+            return boxing_method(
+                builder, produced_blob_object, consumer_op_arg_parallel_attr
+            )
+
+    boxing_methods_names = [GetBoxingDebugString(m) for m in boxing_methods]
+    FirstMatched.__debug_str__ = "(%s)" % " | ".join(boxing_methods_names)
+    return FirstMatched
+
+
+def OptionalBoxing(boxing_method):
+    opt_boxing_method = FirstMatchedBoxing(boxing_method, NoBoxing)
+    debug_str = "Optional(%s)" % GetBoxingDebugString(boxing_method)
+    opt_boxing_method.__debug_str__ = debug_str
+    return opt_boxing_method
+
+
+def ComposeBoxing(
+    lhs_boxing, rhs_boxing, get_middle_op_arg_parallel_attr, middle_verbose_str=None
+):
+    composed_hob = boxing_hob.ComposeHob(
+        enable_if.get_condition_hob(lhs_boxing),
+        enable_if.get_condition_hob(rhs_boxing),
+        get_middle_op_arg_parallel_attr=get_middle_op_arg_parallel_attr,
+        middle_verbose_str=middle_verbose_str,
+    )
+
+    @enable_if.condition(composed_hob)
+    def Composed(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+        tmp_op_arg_parallel_attr = get_middle_op_arg_parallel_attr(
+            builder, produced_blob_object, consumer_op_arg_parallel_attr
+        )
+        tmp = lhs_boxing(builder, produced_blob_object, tmp_op_arg_parallel_attr)
+        return rhs_boxing(builder, tmp, consumer_op_arg_parallel_attr)
+
+    Composed.__debug_str__ = "%s->%s" % (
+        GetBoxingDebugString(lhs_boxing),
+        GetBoxingDebugString(rhs_boxing),
+    )
+    Composed.__left_debug_str__ = GetBoxingLeftDebugString(lhs_boxing)
+    Composed.__right_debug_str__ = GetBoxingRightDebugString(rhs_boxing)
+    return Composed
+
+
+def GetBoxingDebugString(boxing_method):
+    if hasattr(boxing_method, "__debug_str__"):
+        return boxing_method.__debug_str__
+    else:
+        return boxing_method.__name__
+
+
+def GetBoxingLeftDebugString(boxing_method):
+    if hasattr(boxing_method, "__left_debug_str__"):
+        return boxing_method.__left_debug_str__
+    else:
+        return GetBoxingDebugString(boxing_method)
+
+
+def GetBoxingRightDebugString(boxing_method):
+    if hasattr(boxing_method, "__right_debug_str__"):
+        return boxing_method.__right_debug_str__
+    else:
+        return GetBoxingDebugString(boxing_method)
+
+
+def Sequential(*boxing_methods, exclude=tuple(), middle_verbose=False):
+    assert not isinstance(boxing_methods[-1], boxing_middle.BoxingToMiddle)
+    composed = boxing_methods[-1]
+    for boxing_to_middle in boxing_methods[-2::-1]:
+        assert isinstance(boxing_to_middle, boxing_middle.BoxingToMiddle)
+        if middle_verbose:
+            middle_verbose_str = "middle op_arg_parallel_attr of %s->%s:" % (
+                GetBoxingDebugString(boxing_to_middle.boxing_method),
+                GetBoxingLeftDebugString(composed),
+            )
+        else:
+            middle_verbose_str = None
+        composed = ComposeBoxing(
+            boxing_to_middle.boxing_method,
+            composed,
+            boxing_to_middle.get_middle_op_arg_parallel_attr,
+            middle_verbose_str=middle_verbose_str,
+        )
+    if len(exclude) > 0:
+        exclude_hob = enable_if.get_condition_hob(exclude[0])
+        for method in exclude[1:]:
+            exclude_hob = exclude_hob | enable_if.get_condition_hob(method)
+        old_hob = enable_if.get_condition_hob(composed)
+        enable_if.set_condition_hob(composed, old_hob & ~exclude_hob)
+    return composed
+
+
+MatchCopyH2D = (
+    (
+        boxing_hob.producer_parallel_desc.machine_id2device_id_list
+        == boxing_hob.consumer_parallel_desc.machine_id2device_id_list
+    )
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+    & (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "gpu")
+)
+
+
+@boxing_condition(MatchCopyH2D)
+def CopyH2D(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return CopyHD(builder, produced_blob_object, consumer_op_arg_parallel_attr)
+
+
+MatchCopyD2H = (
+    (
+        boxing_hob.producer_parallel_desc.machine_id2device_id_list
+        == boxing_hob.consumer_parallel_desc.machine_id2device_id_list
+    )
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+    & (boxing_hob.producer_parallel_desc.device_tag == "gpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+)
+
+
+@boxing_condition(MatchCopyD2H)
+def CopyD2H(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return CopyHD(builder, produced_blob_object, consumer_op_arg_parallel_attr)
+
+
+def CopyHD(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    arg_parallel_desc_symbol = consumer_op_arg_parallel_attr.parallel_desc_symbol
+    op_device_tag = arg_parallel_desc_symbol.device_tag
+    return BuildCopyHdInstruction(builder, produced_blob_object, op_device_tag)
+
+
+BlobIsPartialSum = boxing_hob.producer_sbp_parallel.HasField("partial_sum_parallel")
+OpArgIsBroadcast = boxing_hob.consumer_sbp_parallel.HasField("broadcast_parallel")
+MatchInterNodeOneToMany = (
+    ~boxing_hob.SingleMachine
+    & (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & OpArgIsBroadcast
+)
+
+
+@boxing_condition(MatchInterNodeOneToMany)
+def InterNodeOneToMany(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    out_blobs = []
+    consumer_dev_ids = (
+        consumer_op_arg_parallel_attr.parallel_desc_symbol.machine_id2device_id_list
+    )
+    for (machine_id, device_ids) in consumer_dev_ids.items():
+        for device_id in device_ids:
+            parallel_conf = placement_cfg.ParallelConf()
+            parallel_conf.set_device_tag("cpu")
+            parallel_conf.add_device_name("@%s:%s" % (machine_id, device_id))
+            parallel_desc_symbol = builder.GetParallelDescSymbol(parallel_conf)
+            out_blob = builder.Build121To(produced_blob_object, parallel_desc_symbol)
+            out_blobs.append(out_blob)
+    return PackPhysicalBoxingBlobObjectsToLogical(
+        builder,
+        out_blobs,
+        consumer_op_arg_parallel_attr,
+        produced_blob_object.op_arg_blob_attr,
+    )
+
+
+MatchInterNodeOneToOne = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc != boxing_hob.consumer_parallel_desc)
+    & (
+        boxing_hob.producer_parallel_desc.parallel_num
+        == boxing_hob.consumer_parallel_desc.parallel_num
+    )
+    & ~boxing_hob.MatchDeviceOneToOnePerMachine
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+)
+
+
+@boxing_condition(MatchInterNodeOneToOne)
+def InterNodeOneToOne(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return builder.Build121To(
+        produced_blob_object, consumer_op_arg_parallel_attr.parallel_desc_symbol
+    )
+
+
+MatchCpuBroadcastOneToOne = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc != boxing_hob.consumer_parallel_desc)
+    & boxing_hob.MatchDeviceOneToOnePerMachine
+    & (
+        (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+        | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    )
+)
+
+
+@boxing_condition(MatchCpuBroadcastOneToOne)
+def CpuBroadcastOneToOne(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    def get_identity_physical_in_blob_objects(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        physical_in_blob_objects,
+        boxing_parallel_desc_symbol,
+        out_parallel_num,
+    ):
+        return physical_in_blob_objects
+
+    return NaiveCpuRefPhysicalBlobObjectsScope(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        get_physical_out_blob_objects=get_identity_physical_in_blob_objects,
+    )
+
+
+MatchNoBoxing = (
+    boxing_hob.producer_parallel_desc == boxing_hob.consumer_parallel_desc
+) & (
+    (boxing_hob.producer_sbp_parallel == boxing_hob.consumer_sbp_parallel)
+    | (boxing_hob.producer_parallel_desc.parallel_num == 1)
+)
+
+
+@boxing_condition(MatchNoBoxing)
+def NoBoxing(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object
+
+
+@boxing_condition(boxing_hob.Verbose & MatchNoBoxing)
+def VerboseNoBoxing(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return produced_blob_object
+
+
+def VerboseOptionalBoxing(boxing_method):
+    opt_boxing_method = FirstMatchedBoxing(boxing_method, VerboseNoBoxing)
+    debug_str = "VerboseOptional(%s)" % GetBoxingDebugString(boxing_method)
+    opt_boxing_method.__debug_str__ = debug_str
+    return opt_boxing_method
+
+
+MatchNcclAllReduce = (
+    boxing_hob.SingleMachine
+    & (boxing_hob.producer_parallel_desc.device_tag == "gpu")
+    & (boxing_hob.producer_parallel_desc == boxing_hob.consumer_parallel_desc)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & BlobIsPartialSum
+    & OpArgIsBroadcast
+)
+
+
+@boxing_condition(MatchNcclAllReduce)
+def GpuNcclAllReduce(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    parallel_conf = consumer_op_arg_parallel_attr.parallel_desc_symbol.parallel_conf
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    bn_in_op2blob_object["in_0"] = produced_blob_object
+    op_attribute = _GetEagerNcclAllReduce(parallel_conf, bn_in_op2blob_object)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    builder.NoBoxingStatelessCall(cfg_op_attribute, parallel_conf, bn_in_op2blob_object)
+    y_blob_object = bn_in_op2blob_object["out_0"]
+    y_blob_object.op_arg_parallel_attr.Assign(consumer_op_arg_parallel_attr)
+    return y_blob_object
+
+
+MatchSplitOneToMany = (
+    (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & boxing_hob.consumer_sbp_parallel.HasField("split_parallel")
+)
+MatchConcatManyToOne = (
+    (boxing_hob.consumer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & boxing_hob.producer_sbp_parallel.HasField("split_parallel")
+)
+MatchConcatManyToSplitMany = (
+    (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & boxing_hob.producer_sbp_parallel.HasField("split_parallel")
+    & boxing_hob.consumer_sbp_parallel.HasField("split_parallel")
+    & (
+        (boxing_hob.producer_sbp_parallel != boxing_hob.consumer_sbp_parallel)
+        | (
+            boxing_hob.producer_parallel_desc.parallel_num
+            != boxing_hob.consumer_parallel_desc.parallel_num
+        )
+    )
+)
+MatchNaiveCpuSplitToSplit = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (MatchSplitOneToMany | MatchConcatManyToOne | MatchConcatManyToSplitMany)
+)
+
+
+@boxing_condition(MatchNaiveCpuSplitToSplit)
+def NaiveCpuSplitToSplit(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return NaiveCpuRefPhysicalBlobObjectsScope(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        get_physical_out_blob_objects=NaiveBoxingToPhysicalBlobObjects,
+    )
+
+
+MatchNaiveCpuPartialSumToSplit = (
+    (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & boxing_hob.producer_sbp_parallel.HasField("partial_sum_parallel")
+    & (
+        (boxing_hob.consumer_parallel_desc.parallel_num == 1)
+        | boxing_hob.consumer_sbp_parallel.HasField("split_parallel")
+    )
+)
+
+
+@boxing_condition(MatchNaiveCpuPartialSumToSplit)
+def NaiveCpuPartialSumToSplit(
+    builder, produced_blob_object, consumer_op_arg_parallel_attr
+):
+    return NaiveCpuRefPhysicalBlobObjectsScope(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        get_physical_out_blob_objects=NaiveBoxingToPhysicalBlobObjects,
+    )
+
+
+def NaiveCpuRefPhysicalBlobObjectsScope(
+    builder,
+    produced_blob_object,
+    consumer_op_arg_parallel_attr,
+    get_physical_out_blob_objects,
+):
+    physical_in_blob_objects = UnpackLogicalBoxingBlobObjectToPhysical(
+        builder, produced_blob_object
+    )
+    consumer_parallel_desc_symbol = consumer_op_arg_parallel_attr.parallel_desc_symbol
+    out_parallel_num = consumer_parallel_desc_symbol.parallel_num
+    boxing_parallel_desc_symbol = GetConcatSplitBoxingParallelDescSymbol(
+        builder,
+        consumer_parallel_desc_symbol,
+        max(len(physical_in_blob_objects), out_parallel_num),
+    )
+    physical_output_blob_objects = get_physical_out_blob_objects(
+        builder=builder,
+        produced_blob_object=produced_blob_object,
+        consumer_op_arg_parallel_attr=consumer_op_arg_parallel_attr,
+        physical_in_blob_objects=physical_in_blob_objects,
+        boxing_parallel_desc_symbol=boxing_parallel_desc_symbol,
+        out_parallel_num=out_parallel_num,
+    )
+    phy_parallel_desc_symbols = builder.GetPhysicalParallelDescSymbols(
+        consumer_op_arg_parallel_attr.parallel_desc_symbol
+    )
+    physical_output_blob_objects = RefBlobObjectWithParallelDesc(
+        builder, physical_output_blob_objects, phy_parallel_desc_symbols
+    )
+    return PackPhysicalBoxingBlobObjectsToLogical(
+        builder,
+        physical_output_blob_objects,
+        consumer_op_arg_parallel_attr,
+        produced_blob_object.op_arg_blob_attr,
+    )
+
+
+def NaiveBoxingToPhysicalBlobObjects(
+    builder,
+    produced_blob_object,
+    consumer_op_arg_parallel_attr,
+    physical_in_blob_objects,
+    boxing_parallel_desc_symbol,
+    out_parallel_num,
+):
+    op_attribute = ConstructNaiveBoxingOpConf(
+        produced_blob_object,
+        consumer_op_arg_parallel_attr,
+        len(physical_in_blob_objects),
+        out_parallel_num,
+    )
+    return BuildNaiveCpuBoxing(
+        builder,
+        op_attribute,
+        physical_in_blob_objects,
+        boxing_parallel_desc_symbol,
+        out_parallel_num,
+    )
+
+
+def RefBlobObjectWithParallelDesc(
+    builder, physical_blob_objects, phy_parallel_desc_symbols
+):
+    assert len(physical_blob_objects) == len(
+        phy_parallel_desc_symbols
+    ), "%s v.s. %s" % (len(physical_blob_objects), len(phy_parallel_desc_symbols))
+
+    def RefWithParallelDesc(physical_blob_object, phy_parallel_desc_symbol):
+        if physical_blob_object.parallel_desc_symbol == phy_parallel_desc_symbol:
+            return physical_blob_object
+        return builder.BroadcastBlobReference(
+            physical_blob_object, phy_parallel_desc_symbol
+        )
+
+    return [
+        RefWithParallelDesc(*pair)
+        for pair in zip(physical_blob_objects, phy_parallel_desc_symbols)
+    ]
+
+
+def PackPhysicalBoxingBlobObjectsToLogical(
+    builder, physical_blob_objects, op_arg_parallel_attr, op_arg_blob_attr
+):
+    if len(physical_blob_objects) == 1:
+        return physical_blob_objects[0]
+    return builder.PackPhysicalBlobsToLogicalBlob(
+        physical_blob_objects, op_arg_parallel_attr, op_arg_blob_attr
+    )
+
+
+def BuildNaiveCpuBoxing(
+    builder,
+    op_attribute,
+    physical_in_blob_objects,
+    boxing_parallel_desc_symbol,
+    out_parallel_num,
+):
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    for i in range(len(physical_in_blob_objects)):
+        bn_in_op2blob_object["in_%s" % i] = physical_in_blob_objects[i]
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    builder.NoBoxingStatelessCall(
+        cfg_op_attribute,
+        boxing_parallel_desc_symbol.parallel_conf,
+        bn_in_op2blob_object,
+    )
+    return [bn_in_op2blob_object["out_%s" % i] for i in range(out_parallel_num)]
+
+
+def ConstructNaiveBoxingOpConf(
+    produced_blob_object,
+    consumer_op_arg_parallel_attr,
+    in_parallel_num,
+    out_parallel_num,
+):
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.name = "undefined_boxing_op_name"
+    op_conf.device_tag = "cpu"
+    op_conf.boxing_conf.lbi.op_name = "undefined_boxing_op_name"
+    op_conf.boxing_conf.lbi.blob_name = "undefined_boxing_blob_name"
+    op_conf.boxing_conf.in_num = in_parallel_num
+    op_conf.boxing_conf.out_num = out_parallel_num
+    in_sbp_parallel = produced_blob_object.op_arg_parallel_attr.sbp_parallel
+    if in_sbp_parallel.has_split_parallel():
+        op_conf.boxing_conf.concat_box.axis = in_sbp_parallel.split_parallel().axis()
+    elif in_parallel_num == 1:
+        op_conf.boxing_conf.concat_box.axis = 0
+    else:
+        assert in_sbp_parallel.has_partial_sum_parallel()
+        op_conf.boxing_conf.add_box.SetInParent()
+    out_sbp_parallel = consumer_op_arg_parallel_attr.sbp_parallel
+    if out_sbp_parallel.has_split_parallel():
+        out_axis = out_sbp_parallel.split_parallel().axis()
+    else:
+        assert out_parallel_num == 1
+        out_axis = 0
+    op_conf.boxing_conf.split_box.axis = out_axis
+    shape = produced_blob_object.op_arg_blob_attr.shape
+    op_conf.boxing_conf.split_box.part_num.extend(
+        balanced_splitter.BalancedPartNums(shape[out_axis], out_parallel_num)
+    )
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    for i in range(in_parallel_num):
+        bn_in_op2blob_object["in_%s" % i] = produced_blob_object
+    return op_infer_util.Infer(op_conf, bn_in_op2blob_object)
+
+
+def GetConcatSplitBoxingParallelDescSymbol(
+    builder, blob_parallel_desc_symbol, max_parallel_num
+):
+    random_rank_id = random.randint(0, max_parallel_num - 1)
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag("cpu")
+    for (machine_id, _) in blob_parallel_desc_symbol.machine_id2device_id_list.items():
+        parallel_conf.add_device_name("@%s:%s" % (machine_id, random_rank_id))
+    return builder.GetParallelDescSymbol(parallel_conf)
+
+
+def UnpackLogicalBoxingBlobObjectToPhysical(builder, produced_blob_object):
+    if produced_blob_object.parallel_desc_symbol.parallel_num == 1:
+        return [produced_blob_object]
+    return builder.UnpackLogicalBlobToPhysicalBlobs(produced_blob_object)
+
+
+MatchCpuBroadcastOneToMany = (
+    boxing_hob.SingleMachine
+    & (boxing_hob.producer_parallel_desc.device_tag == "cpu")
+    & (boxing_hob.consumer_parallel_desc.device_tag == "cpu")
+    & boxing_hob.ProducerDevicesContainedInConsumerDevices
+    & (boxing_hob.producer_parallel_desc.parallel_num == 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num > 1)
+    & boxing_hob.consumer_sbp_parallel.HasField("broadcast_parallel")
+)
+
+
+@boxing_condition(MatchCpuBroadcastOneToMany)
+def CpuBroadcastOneToMany(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    return CpuOneToManyBroadcastBlobReference(
+        builder,
+        produced_blob_object,
+        consumer_op_arg_parallel_attr.parallel_desc_symbol,
+    )
+
+
+MatchBroadcastManyToOne = (
+    (
+        boxing_hob.producer_parallel_desc.device_tag
+        == boxing_hob.consumer_parallel_desc.device_tag
+    )
+    & boxing_hob.ConsumerDevicesContainedInProducerDevices
+    & (boxing_hob.producer_parallel_desc.parallel_num > 1)
+    & (boxing_hob.consumer_parallel_desc.parallel_num == 1)
+    & boxing_hob.producer_sbp_parallel.HasField("broadcast_parallel")
+)
+
+
+@boxing_condition(MatchBroadcastManyToOne)
+def BroadcastManyToOne(builder, produced_blob_object, consumer_op_arg_parallel_attr):
+    y_blob_objects = builder.UnpackLogicalBlobToPhysicalBlobs(produced_blob_object)
+    for y in y_blob_objects:
+        if y.parallel_desc_symbol == consumer_op_arg_parallel_attr.parallel_desc_symbol:
+            return y
+    raise NotImplementedError("op_arg's devices is not contained in blob's devices")
+
+
+def Assign(builder, ref_blob_object, value_blob_object):
+    return BuildAssignInstruction(
+        builder, ref_blob_object, value_blob_object, _AssignOpConf()
+    )
+
+
+def CpuOneToManyBroadcastBlobReference(
+    builder, produced_blob_object, to_parallel_desc_symbol
+):
+    x_parallel_desc_symbol = produced_blob_object.parallel_desc_symbol
+    x_machine_ids = list(dict(x_parallel_desc_symbol.machine_id2device_id_list).keys())
+    to_machine_ids = list(
+        dict(to_parallel_desc_symbol.machine_id2device_id_list).keys()
+    )
+    assert x_machine_ids == to_machine_ids, (x_machine_ids, to_machine_ids)
+    x_first_device_ids = x_parallel_desc_symbol.machine_id2device_id_list[
+        x_machine_ids[0]
+    ]
+    assert len(x_first_device_ids) == 1, x_first_device_ids
+    if x_parallel_desc_symbol == to_parallel_desc_symbol:
+        return produced_blob_object
+    return builder.BroadcastBlobReference(produced_blob_object, to_parallel_desc_symbol)
+
+
+def BuildCopyHdInstruction(builder, produced_blob_object, to_device_tag):
+    (op_conf, lbi) = _MakeCopyHdOpConfAndRetLbi()
+    return _BuildCopyInstruction(builder, produced_blob_object, op_conf, to_device_tag)
+
+
+def _MakeCopyHdOpConfAndRetLbi():
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.name = "copy_hd"
+    op_conf.device_tag = "gpu"
+    setattr(op_conf.copy_conf, "in", "%s/in" % op_conf.name)
+    op_conf.copy_conf.out = "out"
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    return (op_conf, lbi)
+
+
+@contextmanager
+def _CudaHostPinBlob(build, blob_object):
+    build.CudaHostRegisterBlob(blob_object)
+    try:
+        yield
+    finally:
+        build.CudaHostUnregisterBlob(blob_object)
+
+
+def _BuildCopyInstruction(builder, produced_blob_object, op_conf, to_device_tag):
+    x_devices = produced_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    x_device_tag = produced_blob_object.parallel_desc_symbol.device_tag
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    bn_in_op2blob_object["in"] = produced_blob_object
+    op_attribute = op_infer_util.Infer(op_conf, bn_in_op2blob_object)
+    assert to_device_tag != x_device_tag, (to_device_tag, x_device_tag)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    if to_device_tag == "cpu" and x_device_tag == "gpu":
+        x_parallel_conf = produced_blob_object.parallel_desc_symbol.parallel_conf
+        builder.NoBoxingCudaD2HStatelessCall(
+            cfg_op_attribute, x_parallel_conf, bn_in_op2blob_object, TryReplaceDeviceTag
+        )
+    elif to_device_tag == "gpu" and x_device_tag == "cpu":
+        out_parallel_desc_symbol = TryReplaceDeviceTag(
+            builder, produced_blob_object.parallel_desc_symbol, to_device_tag
+        )
+        out_parallel_conf = out_parallel_desc_symbol.parallel_conf
+        with _CudaHostPinBlob(builder, produced_blob_object):
+            builder.NoBoxingCudaH2DStatelessCall(
+                cfg_op_attribute, out_parallel_conf, bn_in_op2blob_object
+            )
+    else:
+        raise NotImplementedError(
+            "invalid device found. to_device_tag: %s, x_device_tag: %s"
+            % (to_device_tag, x_device_tag)
+        )
+    sbp_parallel = bn_in_op2blob_object["out"].op_arg_parallel_attr.sbp_parallel
+    sbp_parallel.CopyFrom(produced_blob_object.op_arg_parallel_attr.sbp_parallel)
+    return bn_in_op2blob_object["out"]
+
+
+def _AssignOpConf():
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.name = "assign"
+    op_conf.assign_conf.ref = "assign/ref"
+    op_conf.assign_conf.value = "assign/value"
+    device_tag = oneflow.current_scope().device_parallel_desc_symbol.device_tag
+    op_conf.device_tag = device_tag
+    return op_conf
+
+
+def BuildAssignInstruction(builder, ref_blob_object, value_blob_object, op_conf):
+    ref_parallel_conf = ref_blob_object.parallel_desc_symbol.parallel_conf
+    ref_devices = ref_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    value_devices = value_blob_object.parallel_desc_symbol.machine_id2device_id_list
+    assert ref_devices == value_devices, "\nref_devices: %s\nvalue_devices: %s" % (
+        ref_devices,
+        value_devices,
+    )
+    ref_device_tag = ref_blob_object.parallel_desc_symbol.device_tag
+    value_device_tag = value_blob_object.parallel_desc_symbol.device_tag
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    bn_in_op2blob_object["ref"] = ref_blob_object
+    bn_in_op2blob_object["value"] = value_blob_object
+    op_attribute = op_infer_util.Infer(op_conf, bn_in_op2blob_object)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    if ref_device_tag == value_device_tag:
+        builder.NoBoxingStatelessCall(
+            cfg_op_attribute, ref_parallel_conf, bn_in_op2blob_object
+        )
+    elif ref_device_tag == "cpu" and value_device_tag == "gpu":
+        value_parallel_conf = value_blob_object.parallel_desc_symbol.parallel_conf
+        builder.NoBoxingCudaD2HStatelessCall(
+            cfg_op_attribute,
+            value_parallel_conf,
+            bn_in_op2blob_object,
+            TryReplaceDeviceTag,
+        )
+    elif ref_device_tag == "gpu" and value_device_tag == "cpu":
+        with _CudaHostPinBlob(builder, value_blob_object):
+            builder.NoBoxingCudaH2DStatelessCall(
+                cfg_op_attribute, ref_parallel_conf, bn_in_op2blob_object
+            )
+    else:
+        raise NotImplementedError(
+            "invalid device found. ref_device_tag: %s, value_device_tag: %s"
+            % (ref_device_tag, value_device_tag)
+        )
+
+
+def TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag):
+    return boxing_middle.TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag)
+
+
+def ReplaceDeviceTag(parallel_desc_symbol, device_tag, builder=None):
+    return boxing_middle.ReplaceDeviceTag(
+        parallel_desc_symbol, device_tag, builder=builder
+    )
+
+
+def _GetEagerNcclAllReduce(parallel_conf, ibn2blob_object):
+    op_conf = op_conf_pb.OperatorConf()
+    op_conf.device_tag = "gpu"
+    op_conf.name = "eager_nccl_all_reduce"
+    op_conf.user_conf.op_type_name = "eager_nccl_all_reduce"
+    op_conf.user_conf.input["in"].s.append("eager_nccl_all_reduce/in_0")
+    op_conf.user_conf.output["out"].s.append("eager_nccl_all_reduce/out_0")
+    op_conf.user_conf.attr["parallel_conf"].at_string = str(parallel_conf)
+    return op_infer_util.Infer(op_conf, ibn2blob_object)
+
+
+NcclAllReduce = Sequential(
+    boxing_middle.BoxingToMiddle(
+        GpuNcclAllReduce,
+        boxing_middle.ProducerParallelDesc,
+        boxing_middle.BroadcastParallel,
+    ),
+    OptionalBoxing(CopyD2H),
+)
+BoxingIntraNodeOneToOne = Sequential(
+    boxing_middle.BoxingToMiddle(
+        OptionalBoxing(CopyD2H),
+        boxing_middle.ReplaceProducerDeviceTag("cpu"),
+        boxing_middle.ProducerSbpParallel,
+    ),
+    boxing_middle.BoxingToMiddle(
+        CpuBroadcastOneToOne,
+        boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+        boxing_middle.ConsumerSbpParallel,
+    ),
+    OptionalBoxing(CopyH2D),
+)
+BoxingInterNodeOneToOne = Sequential(
+    boxing_middle.BoxingToMiddle(
+        OptionalBoxing(CopyD2H),
+        boxing_middle.ReplaceProducerDeviceTag("cpu"),
+        boxing_middle.ProducerSbpParallel,
+    ),
+    boxing_middle.BoxingToMiddle(
+        InterNodeOneToOne,
+        boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+        boxing_middle.ConsumerSbpParallel,
+    ),
+    OptionalBoxing(CopyH2D),
+)
+BoxingInterNodeOneToMany = Sequential(
+    boxing_middle.BoxingToMiddle(
+        OptionalBoxing(CopyD2H),
+        boxing_middle.ReplaceProducerDeviceTag("cpu"),
+        boxing_middle.ProducerSbpParallel,
+    ),
+    boxing_middle.BoxingToMiddle(
+        InterNodeOneToMany,
+        boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+        boxing_middle.ConsumerSbpParallel,
+    ),
+    OptionalBoxing(CopyH2D),
+)
+conditional_function_table = [
+    CopyH2D,
+    CopyD2H,
+    NoBoxing,
+    BoxingIntraNodeOneToOne,
+    BoxingInterNodeOneToOne,
+    BoxingInterNodeOneToMany,
+    BroadcastManyToOne,
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(BroadcastManyToOne),
+            boxing_middle.ProducerRandomParallelIdPerMachine(),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CpuBroadcastOneToOne),
+            boxing_middle.ConsumerRandomParallelIdPerMachine("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CpuBroadcastOneToMany),
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+        exclude=(
+            BroadcastManyToOne,
+            CopyH2D,
+            CopyD2H,
+            NoBoxing,
+            BoxingIntraNodeOneToOne,
+        ),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            BroadcastManyToOne,
+            boxing_middle.ProducerRandomParallelIdPerMachine(),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuSplitToSplit,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.ConsumerSbpParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+    ),
+    NcclAllReduce,
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuPartialSumToSplit,
+            boxing_middle.ConsumerRandomParallelIdPerMachine("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            CpuBroadcastOneToMany,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+        exclude=(NcclAllReduce,),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuPartialSumToSplit,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.ConsumerSbpParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuSplitToSplit,
+            boxing_middle.ConsumerRandomParallelIdPerMachine("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            CpuBroadcastOneToMany,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.BroadcastParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+        exclude=(NcclAllReduce,),
+    ),
+    Sequential(
+        boxing_middle.BoxingToMiddle(
+            OptionalBoxing(CopyD2H),
+            boxing_middle.ReplaceProducerDeviceTag("cpu"),
+            boxing_middle.ProducerSbpParallel,
+        ),
+        boxing_middle.BoxingToMiddle(
+            NaiveCpuSplitToSplit,
+            boxing_middle.ReplaceConsumerDeviceTag("cpu"),
+            boxing_middle.ConsumerSbpParallel,
+        ),
+        OptionalBoxing(CopyH2D),
+    ),
+]
+
+
+class BoxingUtil(oneflow._oneflow_internal.deprecated.ForeignBoxingUtil):
+    def __init__(self):
+        oneflow._oneflow_internal.deprecated.ForeignBoxingUtil.__init__(self)
+
+    def BoxingTo(self, builder, blob_object, op_arg_parallel_attr):
+        return BoxingTo(builder, blob_object, op_arg_parallel_attr)
+
+    def TryReplaceDeviceTag(self, builder, parallel_desc_symbol, device_tag):
+        return TryReplaceDeviceTag(builder, parallel_desc_symbol, device_tag)
+
+    def Assign(self, builder, target_blob_object, source_blob_object):
+        return Assign(builder, target_blob_object, source_blob_object)
+
+
+_global_boxing_util = BoxingUtil()
diff --git a/python/oneflow/eager/eager_blob_util.py b/python/oneflow/eager/eager_blob_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..86b59fc4827cd5d0e51a6e4b96764e889b85483a
--- /dev/null
+++ b/python/oneflow/eager/eager_blob_util.py
@@ -0,0 +1,111 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+import oneflow.framework.blob_trait as blob_trait
+import oneflow.framework.python_callback as python_callback
+import oneflow.support.async_util as async_util
+from oneflow.framework.dtype import convert_proto_dtype_to_oneflow_dtype
+
+
+@property
+def dtype(self):
+    return convert_proto_dtype_to_oneflow_dtype(self.get_dtype())
+
+
+def numpy(self):
+    return _GetPhysicalBlobBodyCache(self.blob_object)
+
+
+def numpy_list(self):
+    return _GetPhysicalBlobBodyCache(self.blob_object)
+
+
+def RegisterMethod4EagerPhysicalBlob():
+    oneflow._oneflow_internal.EagerPhysicalBlob.dtype = dtype
+    oneflow._oneflow_internal.EagerPhysicalBlob.numpy = numpy
+    oneflow._oneflow_internal.EagerPhysicalBlob.numpy_list = numpy_list
+
+
+def FetchTensorBlobAsNumpyList(parallel_size, blob_object):
+    def AsyncFetchBlobBody(Yield):
+        fetcher = _MakeFetcherEagerBlobBodyAsNumpyFromOfBlob(Yield)
+
+        def BuildFetchBlobBodyInstruction(builder):
+            builder.FetchBlobBody(
+                blob_object, python_callback.GetIdForRegisteredCallback(fetcher)
+            )
+            builder.InsertRemoveForeignCallbackInstruction(
+                blob_object.object_id,
+                python_callback.GetIdForRegisteredCallback(fetcher),
+            )
+
+        oneflow._oneflow_internal.deprecated.PhysicalRun(BuildFetchBlobBodyInstruction)
+
+    return async_util.Await(parallel_size, AsyncFetchBlobBody)
+
+
+def _GetPhysicalBlobHeaderCache(blob_object):
+    return _FetchBlobHeader(blob_object)
+
+
+def _GetPhysicalBlobBodyCache(blob_object):
+    return _FetchPhysicalBlobBody(blob_object)
+
+
+def _FetchBlobHeader(blob_object):
+    def AsyncFetchBlobHeader(Yield):
+        fetcher = _MakeFetcherEagerPhysicalBlobHeaderFromOfBlob(Yield)
+
+        def BuildFetchBlobHeaderInstruction(builder):
+            builder.FetchBlobHeader(
+                blob_object, python_callback.GetIdForRegisteredCallback(fetcher)
+            )
+            builder.InsertRemoveForeignCallbackInstruction(
+                blob_object.object_id,
+                python_callback.GetIdForRegisteredCallback(fetcher),
+            )
+
+        oneflow._oneflow_internal.deprecated.PhysicalRun(
+            BuildFetchBlobHeaderInstruction
+        )
+
+    return async_util.Await(1, AsyncFetchBlobHeader)[0]
+
+
+def _FetchPhysicalBlobBody(blob_object):
+    return FetchTensorBlobAsNumpyList(1, blob_object)[0]
+
+
+def _MakeFetcherEagerPhysicalBlobHeaderFromOfBlob(Yield):
+    def Callback(ofblob):
+        Yield(
+            oneflow._oneflow_internal.EagerPhysicalBlobHeader(
+                ofblob.static_shape,
+                ofblob.shape,
+                oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+                    ofblob.dtype
+                ),
+            )
+        )
+
+    return Callback
+
+
+def _MakeFetcherEagerBlobBodyAsNumpyFromOfBlob(Yield):
+    def FetchFromOfBlob(ofblob):
+        Yield(ofblob.CopyToNdarray())
+
+    return FetchFromOfBlob
diff --git a/python/oneflow/eager/gradient_util.py b/python/oneflow/eager/gradient_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dec1c49074b83b7bb524f72a50025bc032d3cbd
--- /dev/null
+++ b/python/oneflow/eager/gradient_util.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow.framework.session_context as session_ctx
+
+
+def GetDefaultBackwardBlobRegister():
+    return session_ctx.GetDefaultSession().backward_blob_register
+
+
+def ReleaseUnusedBlobObject(op_attribute, blob_register):
+    assert op_attribute.HasField("blob_last_used_signature"), op_attribute
+    signature_map = op_attribute.blob_last_used_signature.bn_in_op2blob_last_used
+    bn_in_op2lbi = op_attribute.arg_signature.bn_in_op2lbi
+    for (bn_in_op, is_blob_last_used) in signature_map.items():
+        if not is_blob_last_used:
+            continue
+        lbi = bn_in_op2lbi[bn_in_op]
+        lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+        blob_register.ClearObject4BlobName(lbn)
+
+
+def TrySetBackwardUsedBlobObject(op_attribute, fw_blob_register, bw_blob_register):
+    assert op_attribute.HasField("blob_backward_used_signature"), op_attribute
+    signature_map = (
+        op_attribute.blob_backward_used_signature.bn_in_op2blob_backward_used
+    )
+    bn_in_op2lbi = op_attribute.arg_signature.bn_in_op2lbi
+    for (bn_in_op, is_blob_backward_used) in signature_map.items():
+        if not is_blob_backward_used:
+            continue
+        lbi = bn_in_op2lbi[bn_in_op]
+        lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+        blob_object = fw_blob_register.GetObject4BlobName(lbn)
+        bw_blob_register.TrySetObject4BlobName(lbn, blob_object)
diff --git a/python/oneflow/eager/interpreter_callback.py b/python/oneflow/eager/interpreter_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3a813307febe0924ac322c90afad94a8b506da1
--- /dev/null
+++ b/python/oneflow/eager/interpreter_callback.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+import oneflow.core.job.placement_pb2 as placement_pb
+import oneflow.core.job.scope_pb2 as scope_pb
+import oneflow.core.operator.op_attribute_pb2 as op_attribute_pb
+import oneflow.eager.gradient_util as gradient_util
+import oneflow.eager.op_executor as op_executor
+import oneflow.eager.symbol_storage as symbol_storage
+import oneflow.framework.scope_util as scope_util
+
+
+def MakeScopeSymbol(job_conf, parallel_conf, is_mirrored):
+    parallel_hierarchy = None
+    if parallel_conf.has_hierarchy():
+        parallel_hierarchy = oneflow._oneflow_internal.Size(
+            tuple(parallel_conf.hierarchy().dim())
+        )
+    return scope_util.MakeInitialScope(
+        job_conf,
+        parallel_conf.device_tag(),
+        list(parallel_conf.device_name()),
+        parallel_hierarchy,
+        is_mirrored,
+    ).symbol_id
+
+
+def MakeParallelDescSymbol(parallel_conf):
+    symbol_id = None
+
+    def BuildInstruction(builder):
+        nonlocal symbol_id
+        symbol_id = builder.GetParallelDescSymbol(parallel_conf).symbol_id
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+    return symbol_id
+
+
+def MirroredCast(op_attribute_str, parallel_conf):
+    op_attribute = text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+    blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+    is_cast_to_mirrored = op_attribute.op_conf.HasField("cast_to_mirrored_conf")
+    is_cast_from_mirrored = op_attribute.op_conf.HasField("cast_from_mirrored_conf")
+    assert is_cast_to_mirrored or is_cast_from_mirrored
+    _MirroredCastAndAddOutputBlobReleaser(op_attribute, blob_register)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    gradient_util.TrySetBackwardUsedBlobObject(
+        op_attribute, blob_register, bw_blob_register
+    )
+
+
+def InterpretCompletedOp(op_attribute_str, parallel_conf):
+    op_attribute = text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+    blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    _InterpretCompletedOp(op_attribute, parallel_conf, blob_register)
+    gradient_util.ReleaseUnusedBlobObject(op_attribute, blob_register)
+
+
+def _InterpretCompletedOp(op_attribute, parallel_conf, blob_register):
+    return op_executor.Interpret(op_attribute, parallel_conf, blob_register)
+
+
+def _MirroredCastAndAddOutputBlobReleaser(op_attribute, blob_register):
+    op_executor.MirroredCast(op_attribute, blob_register)
+    _AddOutputBlobObjectReleaser4InputBlobObject(op_attribute, blob_register)
+
+
+def _AddOutputBlobObjectReleaser4InputBlobObject(op_attribute, blob_register):
+    in_lbi = op_attribute.arg_signature.bn_in_op2lbi["in"]
+    in_lbn = "%s/%s" % (in_lbi.op_name, in_lbi.blob_name)
+    in_blob_object = blob_register.GetObject4BlobName(in_lbn)
+    release = _MakeReleaser4MirroredCastBlobObject(op_attribute, blob_register)
+    in_blob_object.add_releaser(release)
+
+
+def _MakeReleaser4MirroredCastBlobObject(op_attribute, blob_register):
+    def ReleaseMirroredBlobObject(obj):
+        for obn in op_attribute.output_bns:
+            lbi = op_attribute.arg_signature.bn_in_op2lbi[obn]
+            lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+            blob_object = blob_register.GetObject4BlobName(lbn)
+            blob_register.ClearObject4BlobName(lbn)
+
+    return ReleaseMirroredBlobObject
diff --git a/python/oneflow/eager/op_executor.py b/python/oneflow/eager/op_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7a63e947473a4f724eb90ed4c21ac366e144717
--- /dev/null
+++ b/python/oneflow/eager/op_executor.py
@@ -0,0 +1,481 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import numpy as np
+from google.protobuf import text_format
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
+import oneflow.core.operator.interface_blob_conf_pb2 as inter_face_blob_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.operator.op_node_signature_pb2 as op_node_signature_pb
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.blob_register as blob_register_util
+import oneflow.eager.boxing_util as boxing_util
+import oneflow.eager.op_infer_util as op_infer_util
+import oneflow.eager.symbol_storage as symbol_storage
+import oneflow.experimental.namescope as name_scope
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.python_callback as python_callback
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_ctx
+
+default_blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def Interpret(op_attribute, parallel_conf, blob_register):
+    if op_attribute.op_conf.HasField("cast_to_mirrored_conf"):
+        return MirroredCast(op_attribute, blob_register)
+    if op_attribute.op_conf.HasField("cast_from_mirrored_conf"):
+        return MirroredCast(op_attribute, blob_register)
+    assert isinstance(parallel_conf, placement_cfg.ParallelConf)
+    if op_attribute.op_conf.HasField("distribute_split_conf"):
+        return DistributeSplitOrClone(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("distribute_clone_conf"):
+        return DistributeSplitOrClone(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("distribute_concat_conf"):
+        return DistributeConcatOrAdd(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("distribute_add_conf"):
+        return DistributeConcatOrAdd(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("variable_conf"):
+        return _FindOrCreateVarBlobObject(op_attribute, parallel_conf, blob_register)
+    if op_attribute.op_conf.HasField("foreign_watch_conf"):
+        return _Watch(op_attribute, parallel_conf, blob_register)
+    return _NaiveInterpret(op_attribute, parallel_conf, blob_register)
+
+
+def OpKernelCall(opkernel_object, op_attribute, blob_register):
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatefulCall(
+                cfg_op_attribute,
+                opkernel_object,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def MirroredCast(op_attribute, blob_register):
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            in_blob_object = bn_in_op2blob_object["in"]
+            parallel_desc_symbol = in_blob_object.parallel_desc_symbol
+            op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+                parallel_desc_symbol, str(op_attribute), "out"
+            )
+            out_blob_object = builder.MakeReferenceBlobObject(
+                in_blob_object, op_arg_parallel_attr
+            )
+            bn_in_op2blob_object["out"] = out_blob_object
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def DistributeSplitOrClone(op_attribute, parallel_conf, blob_register):
+    parallel_sig = op_attribute.parallel_signature.bn_in_op2parallel_desc_symbol_id
+
+    def GetInBlobObject(builder, ibn, bn_in_op2blob_object):
+        origin_blob_object = bn_in_op2blob_object[ibn]
+        in_op_parallel_desc_sym = oneflow._oneflow_internal.GetPlacementSymbol(
+            parallel_sig[ibn]
+        )
+        in_op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+            in_op_parallel_desc_sym, str(op_attribute), ibn
+        )
+        return boxing_util.BoxingTo(
+            builder, origin_blob_object, in_op_arg_parallel_attr
+        )
+
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            physical_out_blob_objects = builder.UnpackLogicalBlobToPhysicalBlobs(
+                GetInBlobObject(builder, "in", bn_in_op2blob_object)
+            )
+            for (i, blob_object) in enumerate(physical_out_blob_objects):
+                bn_in_op2blob_object["out_%s" % i] = blob_object
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def DistributeConcatOrAdd(op_attribute, parallel_conf, blob_register):
+    op_parallel_desc_sym = oneflow._oneflow_internal.GetPlacementSymbol(
+        op_attribute.parallel_signature.op_parallel_desc_symbol_id
+    )
+    parallel_size = len(op_attribute.input_bns)
+    op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+        op_parallel_desc_sym, str(op_attribute), "out"
+    )
+    op_arg_blob_attr = oneflow._oneflow_internal.GetOpArgBlobAttribute(
+        str(op_attribute), "out"
+    )
+    parallel_sig = op_attribute.parallel_signature.bn_in_op2parallel_desc_symbol_id
+
+    def GetInBlobObject(builder, i, bn_in_op2blob_object):
+        ibn = "in_%s" % i
+        origin_blob_object = bn_in_op2blob_object[ibn]
+        in_op_parallel_desc_sym = oneflow._oneflow_internal.GetPlacementSymbol(
+            parallel_sig[ibn]
+        )
+        in_op_arg_parallel_attr = oneflow._oneflow_internal.GetOpArgParallelAttribute(
+            in_op_parallel_desc_sym, str(op_attribute), ibn
+        )
+        return boxing_util.BoxingTo(
+            builder, origin_blob_object, in_op_arg_parallel_attr
+        )
+
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+
+            def GetPhysicalInBlob(i):
+                return GetInBlobObject(builder, i, bn_in_op2blob_object)
+
+            in_blob_objects = [GetPhysicalInBlob(i) for i in range(parallel_size)]
+            bn_in_op2blob_object["out"] = builder.PackPhysicalBlobsToLogicalBlob(
+                in_blob_objects, op_arg_parallel_attr, op_arg_blob_attr
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def _FindOrCreateVarBlobObject(op_attribute, parallel_conf, blob_register):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    name = name_scope.GetJobNameScopePrefix(job_name) + op_attribute.op_conf.name
+    sess = session_ctx.GetDefaultSession()
+    (var_blob, _) = sess.TryGetVariableBlobOfJobFromStash(job_name, name)
+    if var_blob is not None:
+        blob_register.SetObject4BlobName(
+            var_blob.logical_blob_name, var_blob.blob_object
+        )
+        return
+    _NaiveInterpret(op_attribute, parallel_conf, blob_register)
+    var_blob = _MakeEagerLogicalBlob(op_attribute, "out", blob_register=blob_register)
+    EagerInitVariableBlob(sess, op_attribute.op_conf, var_blob)
+    sess.StashVariableBlob4Job(job_name, op_attribute.op_conf.name, var_blob)
+    return var_blob
+
+
+def _Watch(op_attribute, parallel_conf, blob_register):
+    lbi = op_attribute.arg_signature.bn_in_op2lbi["in"]
+    uuid = op_attribute.op_conf.foreign_watch_conf.handler_uuid
+    lbn = "%s/%s" % (lbi.op_name, lbi.blob_name)
+    in_blob_object = blob_register.GetObject4BlobName(lbn)
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    if in_blob_object.op_arg_parallel_attr.is_mirrored():
+        blob = oneflow._oneflow_internal.EagerMirroredBlob(
+            lbi, in_blob_object, default_blob_register
+        )
+    else:
+        blob = oneflow._oneflow_internal.EagerConsistentBlob(
+            lbi, in_blob_object, default_blob_register
+        )
+    uuid2watch_handler = session_ctx.GetDefaultSession().uuid2watch_handler
+    assert uuid in uuid2watch_handler
+    uuid2watch_handler[uuid](blob)
+    del uuid2watch_handler[uuid]
+
+
+def _NaiveInterpret(op_attribute, parallel_conf, blob_register):
+    def BuildInstruction(builder):
+        with blob_register_util.BnInOp2BlobObjectScope(
+            blob_register, op_attribute
+        ) as bn_in_op2blob_object:
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatelessCall(
+                cfg_op_attribute,
+                parallel_conf,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+
+def _MakeEagerLogicalBlob(op_attribute, obn, blob_register):
+    lbi = op_attribute.arg_signature.bn_in_op2lbi[obn]
+    blob_object = blob_register.GetObject4BlobName(
+        "%s/%s" % (lbi.op_name, lbi.blob_name)
+    )
+    mirrored_sig_map = op_attribute.mirrored_signature.bn_in_op2opt_mirrored_parallel
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    if mirrored_sig_map[obn].HasField("mirrored_parallel"):
+        return oneflow._oneflow_internal.EagerMirroredBlob(
+            lbi, blob_object, default_blob_register
+        )
+    else:
+        return oneflow._oneflow_internal.EagerConsistentBlob(
+            lbi, blob_object, default_blob_register
+        )
+
+
+def EagerInitVariableBlob(sess, var_op_conf, var_blob):
+    snapshot_path = sess.snapshot_mgr.get_snapshot_path(var_op_conf.name)
+    with oneflow.scope.placement("cpu", "0:0"):
+        if snapshot_path is None:
+            blob_object = _EagerRunModelInit(var_op_conf)
+        else:
+            blob_object = _EagerRunModelLoad(var_op_conf, snapshot_path)
+        _Assign(var_blob.blob_object, blob_object)
+
+
+def EagerSaveVariableBlob(snapshot_path):
+    var_blobs = session_ctx.GetDefaultSession().var_name2var_blob.values()
+    with oneflow.scope.placement("cpu", "0:0"):
+        _EagerRunModelSave(var_blobs, snapshot_path)
+
+
+def _Assign(var_blob_object, value_blob_object):
+    def BuildAssignInstruction(builder):
+        new_parallel_desc_symbol = boxing_util.TryReplaceDeviceTag(
+            builder, var_blob_object.parallel_desc_symbol, "cpu"
+        )
+        consumer_op_arg_parallel_attr = oneflow._oneflow_internal.OpArgParallelAttribute(
+            new_parallel_desc_symbol,
+            str(var_blob_object.op_arg_parallel_attr.sbp_parallel),
+            str(var_blob_object.op_arg_parallel_attr.opt_mirrored_parallel),
+        )
+        tmp_blob_object = boxing_util.BoxingTo(
+            builder, value_blob_object, consumer_op_arg_parallel_attr
+        )
+        boxing_util.Assign(builder, var_blob_object, tmp_blob_object)
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildAssignInstruction)
+
+
+def _BuildNotMirroredScope(old_scope, builder):
+    return builder.BuildScopeWithNewIsMirrored(old_scope, False)
+
+
+def _EagerRunModelInit(var_op_conf):
+    (op_conf, _) = _GenModelInitOpConfAndRetLbi(var_op_conf)
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildModelInitInstruction(builder):
+        upstream_signature = op_node_signature_pb.OpNodeSignature()
+        op_conf.scope_symbol_id = oneflow.current_scope().symbol_id
+        op_attribute = c_api_util.InferOpConf(op_conf, upstream_signature)
+        parallel_conf = (
+            oneflow.current_scope().device_parallel_desc_symbol.parallel_conf
+        )
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    sess = session_ctx.GetDefaultSession()
+    with scope_util.ScopeContext(scope_util.MakeScope(_BuildNotMirroredScope)):
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildModelInitInstruction)
+    return bn_in_op2blob_object["out_0"]
+
+
+def _MakeModelIOPathInputBuilds(op_conf, path, bn_in_op2blob_object):
+    def BuildModelIOPathInputInstruction(builder):
+        op_attribute = op_infer_util.Infer(op_conf, ibn2blob_object={})
+        parallel_conf = (
+            oneflow.current_scope().device_parallel_desc_symbol.parallel_conf
+        )
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    def FeedPath(ofblob):
+        ofblob.CopyFromNdarray(np.frombuffer(path.encode("ascii"), dtype=np.int8))
+
+    def BuildFeedPathInstruction(builder):
+        blob_object = bn_in_op2blob_object["out"]
+        builder.FeedBlob(
+            blob_object, python_callback.GetIdForRegisteredCallback(FeedPath)
+        )
+        builder.InsertRemoveForeignCallbackInstruction(
+            blob_object.object_id, python_callback.GetIdForRegisteredCallback(FeedPath)
+        )
+
+    return (BuildModelIOPathInputInstruction, BuildFeedPathInstruction)
+
+
+def _EagerRunModelLoad(var_op_conf, snapshot_path):
+    assert isinstance(snapshot_path, str)
+    assert os.path.basename(snapshot_path) == "out"
+    snapshot_path = os.path.dirname(snapshot_path)
+    assert os.path.basename(snapshot_path) == var_op_conf.name
+    snapshot_path = os.path.dirname(snapshot_path)
+    (path_input_op_conf, path_lbi) = _GenModelIOPathInputOpConfAndRetLbi()
+    path_input_blob_objects = {}
+    (
+        BuildModelIOPathInputInstruction,
+        BuildFeedPathInstruction,
+    ) = _MakeModelIOPathInputBuilds(
+        path_input_op_conf, snapshot_path, path_input_blob_objects
+    )
+    (model_load_op_conf, _) = _GenModelLoadOpConfAndRetLbi(var_op_conf, path_lbi)
+    model_load_blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildModelLoadInstruction(builder):
+        path_blob_object = path_input_blob_objects["out"]
+        model_load_blob_objects["path"] = path_blob_object
+        op_attribute = op_infer_util.Infer(
+            model_load_op_conf, ibn2blob_object=model_load_blob_objects
+        )
+        parallel_conf = path_blob_object.parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute,
+            parallel_conf,
+            model_load_blob_objects,
+            boxing_util.BoxingTo,
+        )
+
+    sess = session_ctx.GetDefaultSession()
+    with scope_util.ScopeContext(scope_util.MakeScope(_BuildNotMirroredScope)):
+        oneflow._oneflow_internal.deprecated.LogicalRun(
+            BuildModelIOPathInputInstruction
+        )
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildFeedPathInstruction)
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildModelLoadInstruction)
+    return model_load_blob_objects["out_0"]
+
+
+def _EagerRunModelSave(var_blobs, snapshot_path):
+    (path_input_op_conf, path_lbi) = _GenModelIOPathInputOpConfAndRetLbi()
+    path_input_blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+    (
+        BuildModelIOPathInputInstruction,
+        BuildFeedPathInstruction,
+    ) = _MakeModelIOPathInputBuilds(
+        path_input_op_conf, snapshot_path, path_input_blob_objects
+    )
+    model_save_op_conf = _GenModelSaveOpConf(var_blobs, path_lbi)
+    model_save_blob_objects = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildModelSaveInstruction(builder):
+        path_blob_object = path_input_blob_objects["out"]
+        model_save_blob_objects["path"] = path_blob_object
+        for (i, blob) in enumerate(var_blobs):
+            model_save_blob_objects["in_{}".format(i)] = blob.blob_object
+        op_attribute = op_infer_util.Infer(
+            model_save_op_conf, ibn2blob_object=model_save_blob_objects
+        )
+        parallel_conf = path_blob_object.parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute,
+            parallel_conf,
+            model_save_blob_objects,
+            boxing_util.BoxingTo,
+        )
+
+    sess = session_ctx.GetDefaultSession()
+    with scope_util.ScopeContext(scope_util.MakeScope(_BuildNotMirroredScope)):
+        oneflow._oneflow_internal.deprecated.LogicalRun(
+            BuildModelIOPathInputInstruction
+        )
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildFeedPathInstruction)
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildModelSaveInstruction)
+
+
+def _GenModelInitOpConfAndRetLbi(var_op_conf):
+    variable_op_conf = op_conf_util.VariableOpConf()
+    variable_op_conf.CopyFrom(var_op_conf.variable_conf)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_init"
+    op_conf.device_tag = "cpu"
+    op_conf.model_init_conf.out.append("out_0")
+    op_conf.model_init_conf.variable_op_name.append(var_op_conf.name)
+    op_conf.model_init_conf.original_variable_conf.append(variable_op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.model_init_conf.out[0]
+    return (op_conf, lbi)
+
+
+def _GenModelLoadOpConfAndRetLbi(var_op_conf, path_lbi):
+    variable_op_conf = op_conf_util.VariableOpConf()
+    variable_op_conf.CopyFrom(var_op_conf.variable_conf)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_load"
+    op_conf.device_tag = "cpu"
+    op_conf.model_load_conf.path = "{}/{}".format(path_lbi.op_name, path_lbi.blob_name)
+    op_conf.model_load_conf.out.append("out_0")
+    op_conf.model_load_conf.variable_op_name.append(var_op_conf.name)
+    op_conf.model_load_conf.original_variable_conf.append(variable_op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.model_load_conf.out[0]
+    return (op_conf, lbi)
+
+
+def _GenModelIOPathInputOpConfAndRetLbi():
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_io_path_input"
+    op_conf.device_tag = "cpu"
+    op_conf.input_conf.out = "out"
+    blob_conf = inter_face_blob_conf_util.InterfaceBlobConf()
+    blob_conf.shape.dim.append(65536)
+    blob_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+        oneflow.int8
+    )
+    blob_conf.is_dynamic = True
+    op_conf.input_conf.blob_conf.CopyFrom(blob_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.input_conf.out
+    return (op_conf, lbi)
+
+
+def _GenModelSaveOpConf(var_blobs, path_lbi):
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = "model_save"
+    op_conf.device_tag = "cpu"
+    op_conf.model_save_conf.path = "{}/{}".format(path_lbi.op_name, path_lbi.blob_name)
+    for blob in var_blobs:
+        getattr(op_conf.model_save_conf, "in").append(blob.logical_blob_name)
+        getattr(op_conf.model_save_conf, "key").append(blob.logical_blob_name)
+    return op_conf
diff --git a/python/oneflow/eager/op_infer_util.py b/python/oneflow/eager/op_infer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8ed06c63c15c59988af580d7d1bef08ba83e1ff
--- /dev/null
+++ b/python/oneflow/eager/op_infer_util.py
@@ -0,0 +1,41 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from google.protobuf import text_format
+
+import oneflow
+import oneflow._oneflow_internal.oneflow.core.operator.op_node_signature as op_node_signature_cfg
+import oneflow.core.operator.op_node_signature_pb2 as op_node_signature_pb
+import oneflow.framework.c_api_util as c_api_util
+
+
+def Infer(op_conf, ibn2blob_object, scope_symbol_id=None):
+    if scope_symbol_id is None:
+        scope_symbol_id = oneflow.current_scope().symbol_id
+    op_conf.scope_symbol_id = scope_symbol_id
+    upstream_signature = MakeUpstreamSignature(ibn2blob_object)
+    return c_api_util.InferOpConf(op_conf, upstream_signature)
+
+
+def MakeUpstreamSignature(ibn2blob_object):
+    upstream_signature_cfg = op_node_signature_cfg.OpNodeSignature()
+    for (ibn, blob_object) in ibn2blob_object.items():
+        blob_object.op_arg_blob_attr.DumpToOpNodeSignature(ibn, upstream_signature_cfg)
+        blob_object.op_arg_parallel_attr.DumpToOpNodeSignature(
+            ibn, upstream_signature_cfg
+        )
+    return text_format.Parse(
+        str(upstream_signature_cfg), op_node_signature_pb.OpNodeSignature()
+    )
diff --git a/python/oneflow/eager/symbol.py b/python/oneflow/eager/symbol.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b49ddbd4b60f13ee3ac10e61257fe4125a44b60
--- /dev/null
+++ b/python/oneflow/eager/symbol.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+
+import oneflow.core.job.placement_pb2 as placement_pb
+import oneflow.framework.c_api_util as c_api_util
+
+
+class Symbol(object):
+    def __init__(self, symbol_id, data):
+        self.symbol_id_ = symbol_id
+        self.data_ = data
+
+    @property
+    def symbol_id(self):
+        return self.symbol_id_
+
+    @property
+    def data(self):
+        return self.data_
diff --git a/python/oneflow/eager/symbol_storage.py b/python/oneflow/eager/symbol_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f5d0cddda3c664f3f7f55c9990e8437eb487fd3
--- /dev/null
+++ b/python/oneflow/eager/symbol_storage.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def HasSymbol4Id(symbol_id):
+    global id2symbol
+    return symbol_id in id2symbol
+
+
+def GetSymbol4Id(symbol_id):
+    global id2symbol
+    assert symbol_id in id2symbol
+    return id2symbol[symbol_id]
+
+
+def SetSymbol4Id(symbol_id, symbol):
+    global id2symbol
+    assert symbol_id not in id2symbol
+    id2symbol[symbol_id] = symbol
+
+
+id2symbol = {}
+
+
+def HasSymbol4SerializedOpConf(serialized_op_conf):
+    global serialized_op_conf2symbol
+    return serialized_op_conf in serialized_op_conf2symbol
+
+
+def GetSymbol4SerializedOpConf(serialized_op_conf):
+    global serialized_op_conf2symbol
+    return serialized_op_conf2symbol[serialized_op_conf]
+
+
+def SetSymbol4SerializedOpConf(serialized_op_conf, symbol):
+    assert not HasSymbol4SerializedOpConf(serialized_op_conf)
+    global serialized_op_conf2symbol
+    serialized_op_conf2symbol[serialized_op_conf] = symbol
+
+
+serialized_op_conf2symbol = {}
diff --git a/python/oneflow/env.py b/python/oneflow/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5ac851a9097b356971e8150b2c52d8043a1f6c
--- /dev/null
+++ b/python/oneflow/env.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.env_util import api_all_device_placement as all_device_placement
+from oneflow.framework.env_util import api_ctrl_port as ctrl_port
+from oneflow.framework.env_util import api_data_port as data_port
+from oneflow.framework.env_util import api_env_init as init
+from oneflow.framework.env_util import api_get_current_resource as current_resource
+from oneflow.framework.env_util import api_grpc_use_no_signal as grpc_use_no_signal
+from oneflow.framework.env_util import api_init_bootstrap_confs as init_bootstrap_confs
+from oneflow.framework.env_util import api_log_dir as log_dir
+from oneflow.framework.env_util import api_logbuflevel as logbuflevel
+from oneflow.framework.env_util import api_logtostderr as logtostderr
+from oneflow.framework.env_util import api_machine as machine
diff --git a/python/oneflow/experimental/__init__.py b/python/oneflow/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1ac6df91d57d0039a65f464ac465ee38e8616cb
--- /dev/null
+++ b/python/oneflow/experimental/__init__.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.experimental.indexed_slices_ops import indexed_slices_reduce_sum
+from oneflow.experimental.interface_op_read_and_write import (
+    FeedValueToInterfaceBlob as set_interface_blob_value,
+)
+from oneflow.experimental.interface_op_read_and_write import (
+    GetInterfaceBlobValue as get_interface_blob_value,
+)
+from oneflow.experimental.namescope import deprecated_name_scope as name_scope
+from oneflow.experimental.square_sum_op import square_sum
+from oneflow.experimental.ssp_variable_proxy_op import ssp_variable_proxy
+from oneflow.experimental.typing_check import (
+    api_enable_typing_check as enable_typing_check,
+)
+from oneflow.experimental.unique_op import unique_with_counts
+from oneflow.framework.c_api_util import GetJobSet as get_job_set
+from oneflow.ops.assign_op import api_one_to_one_assign as eager_assign_121
+from oneflow.ops.util.custom_op_module import CustomOpModule as custom_op_module
diff --git a/python/oneflow/experimental/indexed_slices_ops.py b/python/oneflow/experimental/indexed_slices_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..61788ea9e1821f12d0a6c5b1d11ea123c97d25f2
--- /dev/null
+++ b/python/oneflow/experimental/indexed_slices_ops.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.input_blob_def as input_blob_util
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def indexed_slices_reduce_sum(
+    indices: input_blob_util.ArgBlobDef,
+    values: input_blob_util.ArgBlobDef,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("IndexedSlicesReduceSum_")
+        )
+        .Op("indexed_slices_reduce_sum")
+        .Input("x_indices", [indices])
+        .Input("x_values", [values])
+        .Output("y_indices")
+        .Output("y_values")
+        .Output("num_unique")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()
diff --git a/python/oneflow/experimental/interface_op_read_and_write.py b/python/oneflow/experimental/interface_op_read_and_write.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a949c78d5b233cf0a706981d81b8b795d07053
--- /dev/null
+++ b/python/oneflow/experimental/interface_op_read_and_write.py
@@ -0,0 +1,170 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.common.shape as shape_proto_cfg
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.dtype as dtype_util
+import oneflow.framework.input_blob_def as input_blob_def_util
+import oneflow.framework.push_util as push_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.runtime_mode as rt_mode
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.async_util as async_util
+
+
+def sync_default_session_if_normal():
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        flow.sync_default_session()
+    else:
+        pass
+
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def _GetInterfaceBlobObject(builder, op_name):
+    sess = session_ctx.GetDefaultSession()
+    if oneflow._oneflow_internal.EagerExecutionEnabled():
+        return sess.var_name2var_blob[op_name].blob_object
+    sess = session_ctx.GetDefaultSession()
+    op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
+    cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+        str(op_attribute)
+    )
+    parallel_conf = sess.ParallelConf4LazyInterfaceOpName(op_name)
+    if not isinstance(
+        parallel_conf, oneflow._oneflow_internal.oneflow.core.job.placement.ParallelConf
+    ):
+        parallel_conf_cfg = placement_cfg.ParallelConf()
+        parallel_conf_cfg.set_device_tag(parallel_conf.device_tag)
+        for device_name in parallel_conf.device_name:
+            parallel_conf_cfg.add_device_name(device_name)
+        if parallel_conf.HasField("hierarchy"):
+            hierarchy = shape_proto_cfg.ShapeProto()
+            for dim in parallel_conf.hierarchy.dim:
+                hierarchy.add_dim(dim)
+            assert hierarchy.dim_size() > 0
+            parallel_conf_cfg.mutable_hierarchy().CopyFrom(hierarchy)
+        parallel_conf = parallel_conf_cfg
+    blob_object = builder.MakeLazyRefBlobObject(
+        op_name, cfg_op_attribute, parallel_conf
+    )
+    return blob_object
+
+
+def GetEagerInterfaceBlob(op_name):
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+
+    def CreateBlob():
+        job_name = sess.JobName4InterfaceOpName(op_name)
+
+        def Build(builder, Yield):
+            blob_object = _GetInterfaceBlobObject(builder, op_name)
+            lbi = lbi_util.LogicalBlobId()
+            lbi.set_op_name(op_name)
+            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
+            assert len(op_attribute.output_bns) == 1
+            lbi.set_blob_name(op_attribute.output_bns[0])
+            if blob_object.op_arg_parallel_attr.is_mirrored():
+                remote_blob = oneflow._oneflow_internal.EagerMirroredBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            else:
+                remote_blob = oneflow._oneflow_internal.EagerConsistentBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            Yield(remote_blob)
+
+        def AsyncGetInterfaceBlob(Yield):
+            oneflow._oneflow_internal.deprecated.LogicalRun(
+                lambda builder: Build(builder, Yield)
+            )
+
+        blob = async_util.Await(1, AsyncGetInterfaceBlob)[0]
+        return blob
+
+    return sess.FindOrCreateLazyBlob(op_name, CreateBlob)
+
+
+def GetInterfaceBlobValue(op_name):
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+    job_name = sess.JobName4InterfaceOpName(op_name)
+
+    def AsyncGetInterfaceBlobValue(Yield):
+        def build(builder):
+            blob_object = GetEagerInterfaceBlob(op_name).blob_object
+            lbi = lbi_util.LogicalBlobId()
+            lbi.set_op_name(op_name)
+            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
+            assert len(op_attribute.output_bns) == 1
+            lbi.set_blob_name(op_attribute.output_bns[0])
+            if not isinstance(lbi, lbi_util.LogicalBlobId):
+                cfg_lbi = lbi_util.LogicalBlobId()
+                cfg_lbi.set_op_name(lbi.op_name)
+                cfg_lbi.set_blob_name(lbi.blob_name)
+                lbi = cfg_lbi
+            if blob_object.op_arg_parallel_attr.is_mirrored():
+                remote_blob = oneflow._oneflow_internal.EagerMirroredBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            else:
+                remote_blob = oneflow._oneflow_internal.EagerConsistentBlob(
+                    lbi, blob_object, blob_register, job_name
+                )
+            value = remote_blob.numpy()
+            Yield(value)
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+    return async_util.Await(1, AsyncGetInterfaceBlobValue)[0]
+
+
+def FeedValueToInterfaceBlobObject(blob_object, ndarray):
+    sync_default_session_if_normal()
+
+    def build(builder):
+        if blob_object.op_arg_parallel_attr.is_mirrored():
+            input_blob_def = input_blob_def_util.MirroredTensorDef(
+                ndarray.shape,
+                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
+            )
+        else:
+            input_blob_def = input_blob_def_util.FixedTensorDef(
+                ndarray.shape,
+                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
+            )
+        push_util.FeedValueToEagerBlob(blob_object, input_blob_def, ndarray)
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+
+def FeedValueToInterfaceBlob(op_name, ndarray):
+    sync_default_session_if_normal()
+
+    def AsyncFeedValueToInterfaceBlob(Yield):
+        def build(builder):
+            blob_object = GetEagerInterfaceBlob(op_name).blob_object
+            FeedValueToInterfaceBlobObject(blob_object, ndarray)
+            Yield()
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+    async_util.Await(1, AsyncFeedValueToInterfaceBlob)
diff --git a/python/oneflow/experimental/load_mnist.py b/python/oneflow/experimental/load_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cbcfbe9587718499e3ae49b7fea01c2362c6f8a
--- /dev/null
+++ b/python/oneflow/experimental/load_mnist.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import hashlib
+import os
+
+import numpy as np
+import requests
+from tqdm import tqdm
+
+
+def get_sha256hash(file_path, Bytes=1024):
+    sha256hash = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        while True:
+            data = f.read(Bytes)
+            if data:
+                sha256hash.update(data)
+            else:
+                break
+    ret = sha256hash.hexdigest()
+    return ret
+
+
+def download_mnist_file(out_path, url):
+    resp = requests.get(url=url, stream=True)
+    size = int(resp.headers["Content-Length"]) / 1024
+    print("File size: %.4f kb, downloading..." % size)
+    with open(out_path, "wb") as f:
+        for data in tqdm(
+            iterable=resp.iter_content(1024), total=size, unit="k", desc=out_path
+        ):
+            f.write(data)
+        print("Done!")
+
+
+def get_mnist_file(sha256, url, out_dir):
+    path = os.path.join(out_dir, "mnist.npz")
+    if not os.path.isfile(path):
+        download_mnist_file(path, url)
+    print("File mnist.npz already exist, path:", path)
+    if not get_sha256hash(path) == sha256:
+        cheksum_fail = "sha256 verification failed, remove {0} and try again".format(
+            path
+        )
+        raise Exception(cheksum_fail)
+    return path
+
+
+def load_mnist(
+    train_batch_size=100,
+    test_batch_size=100,
+    data_format="NCHW",
+    url="https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist.npz",
+    hash_check="63d4344077849053dc3036b247fa012b2b381de53fd055a66b539dffd76cf08e",
+    out_dir=".",
+):
+    """Load mnist dataset, return images and labels,
+            if  dataset doesn't exist, then download it to directory that out_dir specified
+
+    Args:
+        train_batch_size (int, optional): batch size for train. Defaults to 100.
+        test_batch_size (int, optional): batch size for test or evaluate. Defaults to 100.
+        data_format (str, optional): data format. Defaults to "NCHW".
+        url (str, optional): url to get mnist.npz. Defaults to "https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist.npz".
+        hash_check (str, optional): file hash value. Defaults to "63d4344077849053dc3036b247fa012b2b381de53fd055a66b539dffd76cf08e".
+        out_dir (str, optional): dir to save downloaded file. Defaults to "./".
+
+    Returns:
+        (train_images, train_labels), (test_images, test_labels)
+    """
+    path = get_mnist_file(hash_check, url, out_dir)
+    with np.load(path, allow_pickle=True) as f:
+        (x_train, y_train) = (f["x_train"], f["y_train"])
+        (x_test, y_test) = (f["x_test"], f["y_test"])
+
+    def normalize(x, y, batch_size):
+        x = x.astype(np.float32) / 255.0
+        y = y.astype(np.int32)
+        if data_format == "NCHW":
+            images = x.reshape((-1, batch_size, 1, x.shape[1], x.shape[2]))
+        else:
+            images = x.reshape((-1, batch_size, x.shape[1], x.shape[2], 1))
+        labels = y.reshape((-1, batch_size))
+        return (images, labels)
+
+    (train_images, train_labels) = normalize(x_train, y_train, train_batch_size)
+    (test_images, test_labels) = normalize(x_test, y_test, test_batch_size)
+    return ((train_images, train_labels), (test_images, test_labels))
diff --git a/python/oneflow/experimental/namescope.py b/python/oneflow/experimental/namescope.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d34c8d7b88174fd52133462da9385c087663077
--- /dev/null
+++ b/python/oneflow/experimental/namescope.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_context
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_name_scope(*args, **kwargs):
+    print(
+        "WARNING:",
+        "oneflow.name_scope/oneflow.experimental.name_scope/deprecated.variable_scope",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.scope.namespace"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return name_scope(*args, **kwargs)
+
+
+@contextmanager
+def name_scope(name: str) -> None:
+    """Create a namespace. All variables within the namespace will have a prefix `[SCOPE NAME]-`. This is for convenience only and has no other effect on the system.
+    Usage::
+
+        with oneflow.scope.namespace("scope1"):
+            ...
+            with oneflow.scope.namespace("scope2"):
+                ...
+
+    Args:
+        name: Name of this namespace
+
+    """
+    assert isinstance(name, str)
+    name_scope_stack_push(name)
+
+    def BuildScope(old_scope, builder):
+        return builder.BuildScopeWithNewScopeName(old_scope, name)
+
+    sess = session_context.GetDefaultSession()
+    try:
+        with scope_util.ScopeContext(scope_util.MakeScope(BuildScope)):
+            yield
+    finally:
+        name_scope_stack_pop()
+
+
+def name_scope_stack_push(name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    sess = session_context.GetDefaultSession()
+    if job_name not in sess.job_name2name_scope_stack:
+        sess.job_name2name_scope_stack[job_name] = []
+    sess.job_name2name_scope_stack[job_name].append(name)
+
+
+def name_scope_stack_pop():
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    sess = session_context.GetDefaultSession()
+    assert job_name in sess.job_name2name_scope_stack
+    assert len(sess.job_name2name_scope_stack[job_name]) > 0
+    return sess.job_name2name_scope_stack[job_name].pop()
+
+
+def GetJobNameScopePrefix(job_name):
+    sess = session_context.GetDefaultSession()
+    if job_name not in sess.job_name2name_scope_stack:
+        return ""
+    if len(sess.job_name2name_scope_stack[job_name]) == 0:
+        return ""
+    return "-".join(sess.job_name2name_scope_stack[job_name]) + "-"
+
+
+def PrependOpNamePrefixIfNeed(op_conf):
+    if op_conf.HasField("variable_conf"):
+        return
+    if op_conf.HasField("decode_ofrecord_conf"):
+        return
+    if op_conf.HasField("user_conf"):
+        return
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_conf.name = GetJobNameScopePrefix(job_name) + op_conf.name
diff --git a/python/oneflow/experimental/scope.py b/python/oneflow/experimental/scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..255e9579bcbbc811db857d65fda70bd7e5495039
--- /dev/null
+++ b/python/oneflow/experimental/scope.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.scope_util import api_scope_config as config
diff --git a/python/oneflow/experimental/square_sum_op.py b/python/oneflow/experimental/square_sum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..db28b71877c615794e191f49e67cdb46e35efd49
--- /dev/null
+++ b/python/oneflow/experimental/square_sum_op.py
@@ -0,0 +1,44 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import operator
+from functools import reduce
+from typing import Optional
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.input_blob_def as input_blob_util
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def square_sum(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SquareSum_")
+        )
+        .Op("square_sum")
+        .Input("x", [x])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/experimental/ssp_variable_proxy_op.py b/python/oneflow/experimental/ssp_variable_proxy_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fe3c139dc36ff76db5f8624bb12ae72929b55c0
--- /dev/null
+++ b/python/oneflow/experimental/ssp_variable_proxy_op.py
@@ -0,0 +1,41 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Tuple
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def ssp_variable_proxy(
+    var: oneflow._oneflow_internal.BlobDesc, buffer_size: int = 1, name=None
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """ return ref_blob, value_blob """
+    if name is None:
+        name = id_util.UniqueStr("SspVariableProxy_")
+    blob_dict = (
+        flow.user_op_builder(name)
+        .Op("ssp_variable_proxy")
+        .Input("var", [var])
+        .Output("ref")
+        .Output("value")
+        .Attr("buffer_size", buffer_size)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobDict()
+    )
+    return (blob_dict["ref"][0], blob_dict["value"][0])
diff --git a/python/oneflow/experimental/typing_check.py b/python/oneflow/experimental/typing_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a0f61cd41c4359f7e1bc4709607b948b1d6da4e
--- /dev/null
+++ b/python/oneflow/experimental/typing_check.py
@@ -0,0 +1,31 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow.framework.hob as hob
+import oneflow.support.enable_if as enable_if
+
+
+def api_enable_typing_check(val: bool = True) -> None:
+    """ enable typing check for global_function """
+    return enable_if.unique([enable_typing_check])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.any_global_function_defined)
+def enable_typing_check(val):
+    global typing_check_enabled
+    typing_check_enabled = val
+
+
+typing_check_enabled = False
diff --git a/python/oneflow/experimental/unique_op.py b/python/oneflow/experimental/unique_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..57fc8e44a16c65c3253e10c59fe0c377657ba11f
--- /dev/null
+++ b/python/oneflow/experimental/unique_op.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.input_blob_def as input_blob_util
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def unique_with_counts(
+    x: input_blob_util.ArgBlobDef,
+    out_idx: flow.dtype = flow.int32,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc]:
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("UniqueWithCounts_")
+        )
+        .Op("unique_with_counts")
+        .Input("x", [x])
+        .Attr("out_idx", out_idx)
+        .Output("y")
+        .Output("idx")
+        .Output("count")
+        .Output("num_unique")
+        .Build()
+    )
+    return op.InferAndTryRun().RemoteBlobList()
diff --git a/python/oneflow/framework/__init__.py b/python/oneflow/framework/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/framework/attr_util.py b/python/oneflow/framework/attr_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..18c37043b3606dad101ee066df16905ecb7c7dcc
--- /dev/null
+++ b/python/oneflow/framework/attr_util.py
@@ -0,0 +1,126 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.common.data_type as data_type_cfg
+import oneflow._oneflow_internal.oneflow.core.common.shape as shape_cfg
+import oneflow._oneflow_internal.oneflow.core.framework.user_op_attr as user_op_attr_cfg
+
+
+def SetAttrValue(attr_value, py_value, default_attr_value):
+    if default_attr_value.HasField("at_bool"):
+        if py_value is None:
+            py_value = True
+        assert type(py_value) is bool
+        attr_value.set_at_bool(py_value)
+    elif default_attr_value.HasField("at_int64"):
+        assert type(py_value) is int
+        attr_value.set_at_int64(py_value)
+    elif default_attr_value.HasField("at_double"):
+        assert type(py_value) is float
+        attr_value.set_at_double(py_value)
+    elif default_attr_value.HasField("at_string"):
+        assert type(py_value) is str
+        attr_value.set_at_string(py_value)
+    else:
+        raise ValueError(
+            "config with type %s is invalid. supported types: [bool, int, float, str]"
+            % type(py_value)
+        )
+
+
+def convert_to_user_attr_value(op_type_name, attr_name, attr_value):
+    attribute = user_op_attr_cfg.AttrValue()
+    assert isinstance(attr_name, str)
+    attr_type = oneflow._oneflow_internal.GetUserOpAttrType(op_type_name, attr_name)
+    if attr_type == user_op_attr_cfg.kAtInt32:
+        assert isinstance(attr_value, int)
+        attribute.set_at_int32(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtInt64:
+        assert isinstance(attr_value, int)
+        attribute.set_at_int64(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtBool:
+        assert isinstance(attr_value, bool)
+        attribute.set_at_bool(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtFloat:
+        assert isinstance(attr_value, (float, int))
+        attribute.set_at_float(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtDouble:
+        assert isinstance(attr_value, (float, int))
+        attribute.set_at_double(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtString:
+        assert isinstance(attr_value, str)
+        attribute.set_at_string(attr_value)
+    elif attr_type == user_op_attr_cfg.kAtShape:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_shape = attribute.mutable_at_shape()
+        for x in attr_value:
+            assert isinstance(x, int)
+            attribute_mutable_at_shape.add_dim(x)
+    elif attr_type == user_op_attr_cfg.kAtDataType:
+        assert attr_value in oneflow.dtypes()
+        attr_value = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            attr_value
+        )
+        assert isinstance(attr_value, int)
+        attribute.set_at_data_type(data_type_cfg.DataType(attr_value))
+    elif attr_type == user_op_attr_cfg.kAtListInt32:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_int32 = attribute.mutable_at_list_int32()
+        for x in attr_value:
+            assert isinstance(x, int)
+            attribute_mutable_at_list_int32.add_val(x)
+    elif attr_type == user_op_attr_cfg.kAtListInt64:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_int64 = attribute.mutable_at_list_int64()
+        for x in attr_value:
+            assert isinstance(x, int)
+            attribute_mutable_at_list_int64.add_val(x)
+    elif attr_type == user_op_attr_cfg.kAtListFloat:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_float = attribute.mutable_at_list_float()
+        for x in attr_value:
+            assert isinstance(x, (float, int))
+            attribute_mutable_at_list_float.add_val(x)
+    elif attr_type == user_op_attr_cfg.kAtListDataType:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_data_type = attribute.mutable_at_list_data_type()
+        for x in attr_value:
+            assert x in oneflow.dtypes()
+            x = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(x)
+            assert isinstance(x, int)
+            attribute_mutable_at_list_data_type.add_val(data_type_cfg.DataType(x))
+    elif attr_type == user_op_attr_cfg.kAtListShape:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_shape = (
+            attribute.mutable_at_list_shape().mutable_val()
+        )
+        for x in attr_value:
+            assert isinstance(x, (tuple, list))
+            shape = shape_cfg.ShapeProto()
+            for dim in x:
+                assert isinstance(dim, int)
+                shape.add_dim(dim)
+            attribute_mutable_at_list_shape.Add().CopyFrom(shape)
+    elif attr_type == user_op_attr_cfg.kAtListString:
+        assert isinstance(attr_value, (tuple, list))
+        attribute_mutable_at_list_string = attribute.mutable_at_list_string()
+        for x in attr_value:
+            assert isinstance(x, str)
+            attribute_mutable_at_list_string.add_val(x)
+    else:
+        raise ValueError("Invalid op attribute type {}".format(attr_type))
+    return attribute
diff --git a/python/oneflow/framework/balanced_splitter.py b/python/oneflow/framework/balanced_splitter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9a72178fb31eca0e8569681163736bc9d4f8584
--- /dev/null
+++ b/python/oneflow/framework/balanced_splitter.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def BalancedPartNums(total, part_size):
+    base = int(total / part_size)
+    remainder = total % part_size
+    return [base + int(i < remainder) for i in range(part_size)]
+
+
+def BalancedRanges(total, part_size):
+    balanced_part_nums = BalancedPartNums(total, part_size)
+    ranges = []
+    start = 0
+    for part_num in balanced_part_nums:
+        end = start + part_num
+        ranges.append((start, end))
+        start == end
+    return ranges
diff --git a/python/oneflow/framework/blob.py b/python/oneflow/framework/blob.py
new file mode 100644
index 0000000000000000000000000000000000000000..07591c2bce285bcdb943afc072f4636508fee4ad
--- /dev/null
+++ b/python/oneflow/framework/blob.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+
+class Blob(object):
+    def __init__(self, ndarray=None):
+        self.ndarray_ = ndarray
+
+    def ndarray(self):
+        return self.ndarray_
+
+    def set_ndarray(self, ndarray):
+        self.ndarray_ = ndarray
+
+    def __getattr__(self, attr):
+        return getattr(self.ndarray_, attr)
+
+
+no_override_field = set(
+    [
+        "__class__",
+        "__doc__",
+        "__new__",
+        "__init__",
+        "__del__",
+        "__call__",
+        "__getattr__",
+        "__getattribute__",
+        "__setattr__",
+        "__delattr__",
+        "__dir__",
+        "__get__",
+        "__set__",
+        "__delete__",
+    ]
+)
+
+
+def MakeBlobMethod(field_name):
+    def ConvertOtherArgs(args):
+        return [x.ndarray_ if isinstance(x, Blob) else x for x in args]
+
+    return lambda self, *args: getattr(self.ndarray_, field_name)(
+        *ConvertOtherArgs(args)
+    )
+
+
+for field_name in dir(np.ndarray):
+    if field_name.startswith("__") == False:
+        continue
+    if field_name in no_override_field:
+        continue
+    setattr(Blob, field_name, MakeBlobMethod(field_name))
diff --git a/python/oneflow/framework/blob_trait.py b/python/oneflow/framework/blob_trait.py
new file mode 100644
index 0000000000000000000000000000000000000000..954824ac2092e776f593f0f7aa406e89d1893e0d
--- /dev/null
+++ b/python/oneflow/framework/blob_trait.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+
+
+def __add__(self, rhs):
+    return oneflow.math.add(self, rhs)
+
+
+def __radd__(self, lhs):
+    return oneflow.math.add(lhs, self)
+
+
+def __sub__(self, rhs):
+    return oneflow.math.subtract(self, rhs)
+
+
+def __rsub__(self, lhs):
+    return oneflow.math.subtract(lhs, self)
+
+
+def __mul__(self, rhs):
+    return oneflow.math.multiply(self, rhs)
+
+
+def __rmul__(self, lhs):
+    return oneflow.math.multiply(lhs, self)
+
+
+def __truediv__(self, rhs):
+    return oneflow.math.divide(self, rhs)
+
+
+def __rtruediv__(self, lhs):
+    return oneflow.math.divide(lhs, self)
+
+
+def __div__(self, rhs):
+    return oneflow.math.divide(self, rhs)
+
+
+def __mod__(self, rhs):
+    return oneflow.math.mod(self, rhs)
+
+
+def __eq__(self, rhs):
+    return oneflow.math.equal(self, rhs)
+
+
+def __ne__(self, rhs):
+    return oneflow.math.not_equal(self, rhs)
+
+
+def __lt__(self, rhs):
+    return oneflow.math.less(self, rhs)
+
+
+def __le__(self, rhs):
+    return oneflow.math.less_equal(self, rhs)
+
+
+def __gt__(self, rhs):
+    return oneflow.math.greater(self, rhs)
+
+
+def __ge__(self, rhs):
+    return oneflow.math.greater_equal(self, rhs)
+
+
+def RegisterBlobOperatorTraitMethod(blob_class):
+    blob_class.__add__ = __add__
+    blob_class.__radd__ = __radd__
+    blob_class.__sub__ = __sub__
+    blob_class.__rsub__ = __rsub__
+    blob_class.__mul__ = __mul__
+    blob_class.__rmul__ = __rmul__
+    blob_class.__truediv__ = __truediv__
+    blob_class.__rtruediv__ = __rtruediv__
+    blob_class.__div__ = __div__
+    blob_class.__mod__ = __mod__
+    blob_class.__eq__ = __eq__
+    blob_class.__ne__ = __ne__
+    blob_class.__lt__ = __lt__
+    blob_class.__le__ = __le__
+    blob_class.__gt__ = __gt__
+    blob_class.__ge__ = __ge__
diff --git a/python/oneflow/framework/c_api_util.py b/python/oneflow/framework/c_api_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..82f0a993a7b5b4a2be5dbe171a0389c74f97f0fe
--- /dev/null
+++ b/python/oneflow/framework/c_api_util.py
@@ -0,0 +1,252 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from google.protobuf import text_format
+
+import oneflow
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow.core.common.data_type_pb2 as dtype_util
+import oneflow.core.common.error_pb2 as error_util
+import oneflow.core.job.env_pb2 as env_pb2
+import oneflow.core.job.job_pb2 as job_pb
+import oneflow.core.job.job_set_pb2 as job_set_pb
+import oneflow.core.job.placement_pb2 as placement_pb
+import oneflow.core.job.resource_pb2 as resource_util
+import oneflow.core.operator.op_attribute_pb2 as op_attribute_pb
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.record.record_pb2 as record_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+from oneflow.core.framework.config_def_pb2 import ConfigDef
+from oneflow.core.job.inter_user_job_info_pb2 import InterUserJobInfo
+
+
+def CurrentResource():
+    resource = oneflow._oneflow_internal.CurrentResource()
+    return text_format.Parse(resource, resource_util.Resource())
+
+
+def EnvResource():
+    resource = oneflow._oneflow_internal.EnvResource()
+    return text_format.Parse(resource, resource_util.Resource())
+
+
+def InitEnv(env_proto, is_multi_client):
+    assert type(env_proto) is env_pb2.EnvProto
+    env_proto_str = text_format.MessageToString(env_proto)
+    oneflow._oneflow_internal.InitEnv(env_proto_str, is_multi_client)
+
+
+def InitLazyGlobalSession(config_proto):
+    assert type(config_proto) is job_set_pb.ConfigProto
+    config_proto_str = text_format.MessageToString(config_proto)
+    oneflow._oneflow_internal.InitLazyGlobalSession(config_proto_str)
+
+
+def GetInterUserJobInfo():
+    inter_user_job_info = oneflow._oneflow_internal.GetSerializedInterUserJobInfo()
+    ret = InterUserJobInfo()
+    ret.ParseFromString(inter_user_job_info)
+    return ret
+
+
+def JobBuildAndInferCtx_Open(job_name):
+    job_name = str(job_name)
+    oneflow._oneflow_internal.JobBuildAndInferCtx_Open(job_name)
+
+
+def CurJobBuildAndInferCtx_SetJobConf(job_config_proto):
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_SetJobConf(job_config_proto)
+
+
+def CurJobBuildAndInferCtx_SetTrainConf(train_config_cfg):
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_SetTrainConf(train_config_cfg)
+
+
+def InferOpConf(op_conf_proto, upstream_signature):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    serialized_upstream_sig = str(text_format.MessageToString(upstream_signature))
+    op_attribute_str = oneflow._oneflow_internal.InferOpConf(
+        serialized_op_conf, serialized_upstream_sig
+    )
+    return text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+
+
+def IsInterfaceOpConf(op_conf):
+    op_type_field = op_conf.WhichOneof("op_type")
+    field_number = op_conf_util.OperatorConf.DESCRIPTOR.fields_by_name[
+        op_type_field
+    ].number
+    return oneflow._oneflow_internal.IsInterfaceOpTypeCase(field_number)
+
+
+def GetOpParallelSymbolId(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    return oneflow._oneflow_internal.GetOpParallelSymbolId(serialized_op_conf)
+
+
+def CheckAndCompleteUserOpConf(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    new_op_conf = oneflow._oneflow_internal.CheckAndCompleteUserOpConf(
+        serialized_op_conf
+    )
+    return text_format.Parse(new_op_conf, op_conf_util.OperatorConf())
+
+
+def CurJobBuildAndInferCtx_AddAndInferConsistentOp(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    add_and_infer = (
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddAndInferConsistentOp
+    )
+    op_attribute_str = add_and_infer(serialized_op_conf)
+    return text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+
+
+def CurJobBuildAndInferCtx_AddAndInferMirroredOp(op_conf_proto):
+    serialized_op_conf = str(text_format.MessageToString(op_conf_proto))
+    add_and_infer = (
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddAndInferMirroredOp
+    )
+    op_attribute_str = add_and_infer(serialized_op_conf)
+    return text_format.Parse(op_attribute_str, op_attribute_pb.OpAttribute())
+
+
+def CurJobBuildAndInferCtx_AddLossLogicalBlobName(lbn):
+    lbn = str(lbn)
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddLossLogicalBlobName(lbn)
+
+
+def CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(lbi_and_uuid):
+    serialized = str(text_format.MessageToString(lbi_and_uuid))
+    oneflow._oneflow_internal.CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(
+        serialized
+    )
+
+
+def JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    return oneflow._oneflow_internal.JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn)
+
+
+def JobBuildAndInferCtx_MirroredBlobGetNumSubLbi(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    return oneflow._oneflow_internal.JobBuildAndInferCtx_MirroredBlobGetNumSubLbi(
+        job_name, lbn
+    )
+
+
+def JobBuildAndInferCtx_MirroredBlobGetSubLbi(job_name, lbn, index):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    ret = oneflow._oneflow_internal.JobBuildAndInferCtx_MirroredBlobGetSerializedSubLbi(
+        job_name, lbn, index
+    )
+    return text_format.Parse(ret, logical_blob_id_util.LogicalBlobId())
+
+
+def JobBuildAndInferCtx_GetStaticShape(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    axis_str = oneflow._oneflow_internal.JobBuildAndInferCtx_GetSerializedIdListAsStaticShape(
+        job_name, lbn
+    )
+    int_list = text_format.Parse(axis_str, record_util.Int64List())
+    return tuple(map(int, int_list.value))
+
+
+def JobBuildAndInferCtx_GetDataType(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    dtype = oneflow._oneflow_internal.JobBuildAndInferCtx_GetDataType(job_name, lbn)
+    return int(dtype)
+
+
+def JobBuildAndInferCtx_IsDynamic(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    ret = oneflow._oneflow_internal.JobBuildAndInferCtx_IsDynamic(job_name, lbn)
+    return ret
+
+
+def JobBuildAndInferCtx_DisableBoxing(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    ret = oneflow._oneflow_internal.JobBuildAndInferCtx_DisableBoxing(job_name, lbn)
+    return ret
+
+
+def JobBuildAndInferCtx_GetSplitAxisFromProducerView(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    split_axis_str = oneflow._oneflow_internal.JobBuildAndInferCtx_GetSplitAxisFromProducerView(
+        job_name, lbn
+    )
+    split_axis = text_format.Parse(split_axis_str, dtype_util.OptInt64())
+    if split_axis.HasField("value"):
+        return split_axis.value
+    return None
+
+
+def JobBuildAndInferCtx_GetParallelConfFromProducerView(job_name, lbn):
+    job_name = str(job_name)
+    lbn = str(lbn)
+    GetParallelConf = (
+        oneflow._oneflow_internal.JobBuildAndInferCtx_GetSerializedParallelConfFromProducerView
+    )
+    parallel_conf = GetParallelConf(job_name, lbn)
+    parallel_conf = text_format.Parse(parallel_conf, placement_pb.ParallelConf())
+    parallel_conf_cfg = placement_cfg.ParallelConf()
+    parallel_conf_cfg.set_device_tag(parallel_conf.device_tag)
+    for device_name in parallel_conf.device_name:
+        parallel_conf_cfg.add_device_name(device_name)
+    return parallel_conf_cfg
+
+
+def GetMachine2DeviceIdListOFRecordFromParallelConf(parallel_conf):
+    serialized_parallel_conf = str(parallel_conf)
+    ofrecord = oneflow._oneflow_internal.GetMachine2DeviceIdListOFRecordFromParallelConf(
+        serialized_parallel_conf
+    )
+    return text_format.Parse(ofrecord, record_util.OFRecord())
+
+
+def GetFunctionConfigDef():
+    func_config_def = oneflow._oneflow_internal.GetFunctionConfigDef()
+    return text_format.Parse(func_config_def, ConfigDef())
+
+
+def GetScopeConfigDef():
+    scope_config_def = oneflow._oneflow_internal.GetScopeConfigDef()
+    return text_format.Parse(scope_config_def, ConfigDef())
+
+
+def GetInterfaceOpAttributes():
+    op_attributes = oneflow._oneflow_internal.GetSerializedInterfaceOpAttributes()
+    return text_format.Parse(op_attributes, op_attribute_pb.OpAttributeList())
+
+
+def GetJobSet():
+    job_set = oneflow._oneflow_internal.GetSerializedJobSet()
+    ret = job_set_pb.JobSet()
+    ret.ParseFromString(job_set)
+    return ret
+
+
+def GetCurrentJob():
+    serialized_job = oneflow._oneflow_internal.GetSerializedCurrentJob()
+    ret = job_pb.Job()
+    ret.ParseFromString(serialized_job)
+    return ret
diff --git a/python/oneflow/framework/check_point.py b/python/oneflow/framework/check_point.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a6b02bfcb715faf54af02ae3b489263ea446c9a
--- /dev/null
+++ b/python/oneflow/framework/check_point.py
@@ -0,0 +1,230 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import datetime
+import os
+import shutil
+from typing import List, Union
+
+import numpy as np
+
+import oneflow.eager.op_executor as op_executor
+import oneflow.framework.check_point_v2 as check_point_v2
+import oneflow.framework.config_util as config_util
+import oneflow.framework.hob as hob
+import oneflow.framework.job_instance as job_instance
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+
+
+class CheckPoint(object):
+    """Create a `CheckPoint` object to manage checkpoint manually.
+
+    """
+
+    def __init__(self) -> None:
+        if not config_util.api_legacy_model_io_enabled():
+            print(
+                "\x1b[1mWARNING: 'flow.train.CheckPoint' is deprecated. Please use the new API:\x1b[0m\nflow.train.CheckPoint().save(path) => \x1b[1m\x1b[92mflow.checkpoint.save(path)\x1b[0m\nflow.train.CheckPoint().load(path) => \x1b[1m\x1b[92mflow.load_variables(flow.checkpoint.get(path))\x1b[0m\nflow.train.CheckPoint().init() is not needed any more.\n"
+            )
+
+    @session_ctx.try_init_default_session
+    def save(self, path: str) -> None:
+        """save a checkpoint to `path`.
+
+        Args:
+            path: A `string` of path to save checkpoint. 
+        """
+        if not config_util.api_legacy_model_io_enabled():
+            check_point_v2.SaveVarDict(path)
+            return
+        assert type(path) is str
+        enable_if.unique([lazy_checkpoint_save, eager_checkpoint_save])(path)
+
+    @session_ctx.try_init_default_session
+    def init(self) -> None:
+        """Initialize models by default initializer of op or Job.
+        """
+        if not config_util.api_legacy_model_io_enabled():
+            return
+        enable_if.unique([lazy_checkpoint_init, eager_checkpoint_init])()
+
+    @session_ctx.try_init_default_session
+    def load(self, path: str) -> None:
+        """load a checkpoint from `path` and initialize models.
+
+        Args:
+            path: A `string` of path to load checkpoint.
+        """
+        if not config_util.api_legacy_model_io_enabled():
+            check_point_v2.LoadVariables(check_point_v2.GetCheckpoint(path))
+            return
+        assert type(path) is str
+        enable_if.unique([lazy_checkpoint_load, eager_checkpoint_load])(path)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.eager_execution_enabled)
+def lazy_checkpoint_save(path):
+    session_ctx.GetDefaultSession().LaunchJob(_MakeModelSaveJobFunc(path))
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.eager_execution_enabled)
+def lazy_checkpoint_init():
+    session_ctx.GetDefaultSession().LaunchJob(_MakeModelInitJobFunc())
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.eager_execution_enabled)
+def lazy_checkpoint_load(path):
+    session_ctx.GetDefaultSession().LaunchJob(_MakeModelLoadJobFunc(path))
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_checkpoint_save(path):
+    op_executor.EagerSaveVariableBlob(path)
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_checkpoint_init():
+    pass
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_checkpoint_load(path):
+    session_ctx.GetDefaultSession().snapshot_mgr.load(path)
+
+
+def _MakeModelInitJobFunc():
+    def push_cb(blob):
+        pass
+
+    def finish_cb():
+        pass
+
+    sess = session_ctx.GetDefaultSession()
+    return job_instance.MakeJobInstance(
+        str(sess.inter_user_job_info.global_model_init_job_name),
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def _MakeModelLoadJobFunc(path):
+    def push_cb(blob):
+        blob.CopyFromNdarray(np.frombuffer(path.encode("ascii"), dtype=np.int8))
+
+    def finish_cb():
+        pass
+
+    sess = session_ctx.GetDefaultSession()
+    return job_instance.MakeJobInstance(
+        str(sess.inter_user_job_info.global_model_load_job_name),
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def _MakeModelSaveJobFunc(path):
+    def push_cb(blob):
+        blob.CopyFromNdarray(np.frombuffer(path.encode("ascii"), dtype=np.int8))
+
+    def finish_cb():
+        pass
+
+    sess = session_ctx.GetDefaultSession()
+    return job_instance.MakeJobInstance(
+        str(sess.inter_user_job_info.global_model_save_job_name),
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+class SimpleCheckPointManager(object):
+    """`SimpleCheckPointManager` is a simple automatic checkpoint manager.
+
+    Args:
+        root_path: root path of snapshot
+        prefix: prefix of snapshot
+    """
+
+    def __init__(self, root_path: str, prefix: str = "snapshot_") -> None:
+        if not os.path.exists(root_path):
+            os.makedirs(root_path)
+        else:
+            assert os.path.isdir(root_path)
+        self._root_path = root_path
+        self._prefix = prefix
+
+    def list_checkpoints(self) -> List[str]:
+        def is_snapshot(name):
+            if not name.startswith(self._prefix):
+                return False
+            snapshot_done = os.path.join(self._GetSnapshotPath(name), "snapshot_done")
+            return os.path.exists(snapshot_done) and os.path.isfile(snapshot_done)
+
+        return sorted([f for f in os.listdir(self._root_path) if is_snapshot(f)])
+
+    def latest_checkpoint(self) -> Union[str, None]:
+        names = self.list_checkpoints()
+        if not names:
+            return None
+        else:
+            return names[-1]
+
+    def initialize_or_restore(self) -> None:
+        name = self.latest_checkpoint()
+        if name:
+            check_point_v2.LoadVariables(
+                check_point_v2.GetCheckpoint(self._GetSnapshotPath(name))
+            )
+        else:
+            self.save()
+
+    def save(self) -> None:
+        check_point_v2.SaveVarDict(self._GetSnapshotPath(self._NextSnapshotName()))
+
+    def _NextSnapshotName(self) -> str:
+        return self._prefix + datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+
+    def _GetSnapshotPath(self, name: str) -> str:
+        return os.path.join(self._root_path, name)
+
+
+class SnapshotManager(object):
+    def __init__(self):
+        self.name2path_ = dict()
+
+    def load(self, root_dir, refresh=True):
+        assert os.path.isdir(root_dir)
+        if refresh:
+            self.name2path_ = dict()
+        for file in os.listdir(root_dir):
+            file_path = os.path.join(root_dir, file)
+            if not os.path.isdir(file_path):
+                continue
+            has_out_subfile = False
+            for f in os.listdir(file_path):
+                fpath = os.path.join(file_path, f)
+                if f == "out" and os.path.isfile(fpath):
+                    has_out_subfile = True
+            if not has_out_subfile:
+                continue
+            assert file not in self.name2path_
+            self.name2path_[file] = os.path.join(file_path, "out")
+
+    def get_snapshot_path(self, name):
+        try:
+            return self.name2path_[name]
+        except KeyError:
+            return None
diff --git a/python/oneflow/framework/check_point_v2.py b/python/oneflow/framework/check_point_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..910a3556df29095aaba76dc8796f462b7b33b032
--- /dev/null
+++ b/python/oneflow/framework/check_point_v2.py
@@ -0,0 +1,618 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+from google.protobuf import text_format
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
+import oneflow.core.framework.user_op_attr_pb2 as attr_value_pb
+import oneflow.core.framework.variable_meta_info_pb2 as variable_meta_info_pb
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_pb
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.boxing_util as boxing_util
+import oneflow.eager.op_infer_util as op_infer_util
+import oneflow.framework.config_util as config_util
+import oneflow.framework.dtype as dtype_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.runtime_mode as rt_mode
+import oneflow.framework.session_context as session_ctx
+import oneflow.ops.get_variable as get_variable
+import oneflow.ops.initializer_util as initializer_util
+import oneflow.support.async_util as async_util
+from oneflow._oneflow_internal import EagerBlobTrait
+from oneflow.experimental import interface_op_read_and_write
+
+META_INFO_FILENAME = "meta"
+DATA_FILENAME = "out"
+FAKE_JOB_NAME = "system_checkpoint"
+OP_PREFIX = "system_checkpoint"
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def sync_default_session_if_normal():
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        oneflow.sync_default_session()
+    else:
+        pass
+
+
+class FileBackendVariableBlob:
+    def __init__(
+        self,
+        var_dir: str,
+        dtype: Optional[oneflow.dtype] = None,
+        shape: Optional[Sequence[int]] = None,
+    ):
+        data_path = os.path.join(var_dir, DATA_FILENAME)
+        assert os.path.isfile(data_path)
+        self.var_dir_ = var_dir
+        meta_info_path = os.path.join(self.var_dir_, META_INFO_FILENAME)
+        if os.path.exists(meta_info_path):
+            meta_info = variable_meta_info_pb.VariableMetaInfo()
+            with open(meta_info_path) as f:
+                text_format.Parse(f.read(), meta_info)
+            self.has_meta_info_ = True
+        else:
+            self.has_meta_info_ = False
+        if self.has_meta_info_:
+            assert dtype is None and shape is None
+            self.shape_ = tuple(meta_info.shape.dim)
+            self.dtype_ = dtype_util.convert_proto_dtype_to_oneflow_dtype(
+                meta_info.data_type
+            )
+        elif shape is not None and dtype is not None:
+            self.shape_ = shape
+            self.dtype_ = dtype
+            self.has_meta_info_ = True
+        elif shape is not None or dtype is not None:
+            raise RuntimeError("both or neither of shape and dtype should be None")
+        else:
+            pass
+        if self.has_meta_info_:
+            itemsize = np.dtype(
+                dtype_util.convert_oneflow_dtype_to_numpy_dtype(self.dtype_)
+            ).itemsize
+            assert os.path.getsize(data_path) == np.prod(self.shape).item() * itemsize
+
+    @property
+    def file_path(self) -> str:
+        return os.path.join(self.var_dir_, DATA_FILENAME)
+
+    @property
+    def shape(self) -> Tuple[int]:
+        return self.shape_
+
+    @property
+    def quant_info(self):
+        raise NotImplementedError()
+
+    @property
+    def dtype(self) -> oneflow.dtype:
+        return self.dtype_
+
+    def numpy(self) -> np.ndarray:
+        if not self.has_meta_info_:
+            raise RuntimeError("This variable does not have meta info")
+        return np.fromfile(
+            self.file_path,
+            dtype=dtype_util.convert_oneflow_dtype_to_numpy_dtype(self.dtype),
+        ).reshape(self.shape)
+
+
+ValueContainer = Union[
+    EagerBlobTrait, FileBackendVariableBlob, np.ndarray, "oneflow.Tensor"
+]
+
+
+def _ElemCnt(shape):
+    return np.prod(shape).astype(int).item()
+
+
+@session_ctx.try_init_default_session
+def GetAllVariables() -> Dict[str, oneflow._oneflow_internal.EagerConsistentBlob]:
+    """
+    Get all variables of all jobs as a dict.
+    """
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+    interface_ops = sess.interface_ops
+    variables = {}
+    for op in interface_ops:
+        op_attr = sess.OpAttribute4InterfaceOpName(op)
+        if op_attr.op_conf.WhichOneof("op_type") != "variable_conf":
+            continue
+        variables[op] = interface_op_read_and_write.GetEagerInterfaceBlob(op)
+    return variables
+
+
+def _LoadSingleVariable(path: str) -> Optional[FileBackendVariableBlob]:
+    if os.path.isfile(os.path.join(path, DATA_FILENAME)):
+        return FileBackendVariableBlob(path)
+    return None
+
+
+def _GetCheckpoint(
+    path: str,
+) -> Union[Dict[str, FileBackendVariableBlob], FileBackendVariableBlob]:
+    assert os.path.isdir(path), "Directory {} doesn't exist!".format(path)
+    single_var = _LoadSingleVariable(path)
+    if single_var is not None:
+        return single_var
+    var_dict = {}
+    for f in os.listdir(path):
+        var_dir = os.path.join(path, f)
+        var = _LoadSingleVariable(var_dir)
+        if var is not None:
+            var_dict[f] = var
+    return var_dict
+
+
+@session_ctx.try_init_default_session
+def GetCheckpoint(
+    path: str,
+) -> Union[Dict[str, FileBackendVariableBlob], FileBackendVariableBlob]:
+    """
+    Load variable(s) from file system.
+    """
+    return _GetCheckpoint(path)
+
+
+def Load(
+    path: str,
+) -> Union[Dict[str, FileBackendVariableBlob], FileBackendVariableBlob]:
+    return _GetCheckpoint(path)
+
+
+def _GetOpNameFromLbn(lbn):
+    return lbn.split("/")[0]
+
+
+def _GetScopeSymbolIdFromEagerBlob(blob):
+    name = _GetOpNameFromLbn(blob.logical_blob_name)
+    sess = session_ctx.GetDefaultSession()
+    op_conf = sess.OpAttribute4InterfaceOpName(name).op_conf
+    scope_symbol_id = op_conf.scope_symbol_id
+    return scope_symbol_id
+
+
+def _ReadSlice(
+    container: ValueContainer,
+) -> Iterable[Tuple[Sequence[int], Sequence[int], np.ndarray]]:
+    """
+    Return a generator which iterates over the input blob or array and yields
+    (start_nd_idx, stop_nd_idx, slice_np_array)
+    """
+    if isinstance(container, oneflow.Tensor):
+
+        def ReadFromTensor(tensor, start_nd_idx, stop_nd_idx):
+            start_nd_idx = list(map(int, start_nd_idx))
+            stop_nd_idx = list(map(int, stop_nd_idx))
+            return tensor[
+                tuple(
+                    [
+                        slice(start_nd_idx[i], stop_nd_idx[i])
+                        for i in range(len(start_nd_idx))
+                    ]
+                )
+            ].numpy()
+
+        yield from _ForEachSlice(container, ReadFromTensor)
+    elif isinstance(container, EagerBlobTrait):
+
+        def ReadFromEagerBlob(eager_blob, start_nd_idx, stop_nd_idx):
+            scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(eager_blob)
+            return _LogicalSlice(
+                eager_blob.blob_object, start_nd_idx, stop_nd_idx, scope_symbol_id
+            )
+
+        yield from _ForEachSlice(container, ReadFromEagerBlob)
+    elif isinstance(container, FileBackendVariableBlob):
+        np_dtype = np.dtype(
+            dtype_util.convert_oneflow_dtype_to_numpy_dtype(container.dtype)
+        )
+        with open(container.file_path, "rb") as f:
+
+            def ReadFromFile(_, start_nd_idx, stop_nd_idx):
+                length = _ElemCnt(np.array(stop_nd_idx) - np.array(start_nd_idx))
+                slice = f.read(length * np_dtype.itemsize)
+                return np.frombuffer(slice, dtype=np_dtype).reshape(
+                    np.array(stop_nd_idx) - np.array(start_nd_idx)
+                )
+
+            yield from _ForEachSlice(container, ReadFromFile)
+    elif isinstance(container, np.ndarray):
+
+        def ReadFromNpArray(array, start_nd_idx, stop_nd_idx):
+            slice_objs = []
+            for (start, stop) in zip(start_nd_idx, stop_nd_idx):
+                slice_objs.append(slice(start, stop))
+            return array[tuple(slice_objs)]
+
+        yield from _ForEachSlice(container, ReadFromNpArray)
+    else:
+        raise RuntimeError("Unknown type: {}".format(type(container).__name__))
+
+
+def _SaveVarDict(
+    path: str,
+    var_dict: Optional[
+        Dict[str, Union[FileBackendVariableBlob, EagerBlobTrait]]
+    ] = None,
+) -> None:
+    if var_dict is None:
+        var_dict = GetAllVariables()
+
+    def IsFileOrNonEmptyDir(path):
+        if os.path.isfile(path):
+            return True
+        if os.path.isdir(path) and len(os.listdir(path)) != 0:
+            return True
+        return False
+
+    assert not IsFileOrNonEmptyDir(
+        path
+    ), "{} is a file or non-empty directory! Note that flow.save is different from torch.save. It saves each weight as a separated file so that a directory instead of a file should be given.".format(
+        path
+    )
+    os.makedirs(path, exist_ok=True)
+    for (name, var) in var_dict.items():
+        meta_info = variable_meta_info_pb.VariableMetaInfo()
+        meta_info.shape.dim[:] = var.shape
+        meta_info.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            var.dtype
+        )
+        var_dir = os.path.join(path, name)
+        param_path = os.path.join(var_dir, DATA_FILENAME)
+        os.makedirs(os.path.dirname(param_path))
+        with open(param_path, "wb") as f:
+            for (_, _, slice) in _ReadSlice(var):
+                f.write(slice.tobytes())
+        with open(os.path.join(var_dir, META_INFO_FILENAME), "w") as f:
+            f.write(text_format.MessageToString(meta_info))
+    with open(os.path.join(path, "snapshot_done"), "w"):
+        pass
+
+
+@session_ctx.try_init_default_session
+def SaveVarDict(
+    path: str,
+    var_dict: Optional[
+        Dict[str, Union[FileBackendVariableBlob, EagerBlobTrait]]
+    ] = None,
+) -> None:
+    """
+    Save `var_dict` to `path`
+    """
+    sync_default_session_if_normal()
+    return _SaveVarDict(path, var_dict)
+
+
+def save(obj, save_dir):
+    return _SaveVarDict(save_dir, obj)
+
+
+def _LogicalSlice(
+    input_blob_object: oneflow._oneflow_internal.BlobObject,
+    start: Sequence[int],
+    stop: Sequence[int],
+    scope_symbol_id: int,
+) -> np.ndarray:
+    """
+    Construct a logical_slice op and run it by oneflow eager,
+    return the sliced result as a numpy ndarray
+    """
+    op_name = id_util.UniqueStr(OP_PREFIX)
+
+    def AsyncSlice(Yield):
+        def build(builder):
+            op_conf = op_conf_pb.OperatorConf()
+            device_tag = oneflow.current_scope().device_parallel_desc_symbol.device_tag
+            op_conf.device_tag = device_tag
+            op_conf.name = op_name
+            op_conf.user_conf.op_type_name = "logical_slice"
+            op_conf.user_conf.input["x"].s.append("{}/x_0".format(op_name))
+            op_conf.user_conf.output["y"].s.append("{}/y_0".format(op_name))
+            parallel_conf = input_blob_object.parallel_desc_symbol.parallel_conf
+            op_conf.user_conf.attr["parallel_conf"].at_string = str(parallel_conf)
+            op_conf.user_conf.attr["start"].at_list_int64.val[:] = start
+            op_conf.user_conf.attr["stop"].at_list_int64.val[:] = stop
+            op_conf.user_conf.attr["step"].at_list_int64.val[:] = [1] * len(start)
+            bn_in_op2blob_object = (
+                oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+            )
+            bn_in_op2blob_object["x_0"] = input_blob_object
+            op_attribute = op_infer_util.Infer(
+                op_conf, bn_in_op2blob_object, scope_symbol_id
+            )
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatelessCall(
+                cfg_op_attribute,
+                parallel_conf,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+            Yield(bn_in_op2blob_object["y_0"])
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(build)
+
+    lbi = lbi_util.LogicalBlobId()
+    lbi.set_op_name(op_name)
+    lbi.set_blob_name(op_name)
+    blob_object = async_util.Await(1, AsyncSlice)[0]
+    blob = oneflow._oneflow_internal.EagerConsistentBlob(
+        lbi,
+        blob_object=blob_object,
+        blob_register=blob_register,
+        job_name=FAKE_JOB_NAME,
+    )
+    return blob.numpy()
+
+
+def _GetCpu0VariableBlobFromNumpy(
+    np_array: np.ndarray, dtype: oneflow.dtype
+) -> oneflow._oneflow_internal.EagerConsistentBlob:
+    """
+    Add a variable on cpu 0, and feed the value of `np_array`
+
+    Note: dtype argument cannot be eliminated by
+    convert_numpy_dtype_to_oneflow_dtype(np_array.dtype),
+    because np.int8 == np.char and
+    numpy_dtype_to_oneflow_dtype(oneflow_dtype_to_numpy_dtype(flow.int8))
+    may be flow.char
+    """
+    with oneflow.scope.placement("cpu", "0:0"):
+        op_name = id_util.UniqueStr(OP_PREFIX)
+        op_conf = get_variable.GenerateVariableOpConf(
+            name=op_name,
+            shape=np_array.shape,
+            dtype=dtype,
+            initializer=initializer_util.zeros_initializer(dtype=dtype),
+            trainable=False,
+        )
+        current_parallel_desc_sym = oneflow.current_scope().device_parallel_desc_symbol
+        device_tag = current_parallel_desc_sym.device_tag
+        op_conf.device_tag = device_tag
+        op_attribute = op_infer_util.Infer(op_conf, {})
+        var_blob = get_variable.CreateEagerVariableBlob(
+            op_attribute, job_name=FAKE_JOB_NAME
+        )
+        interface_op_read_and_write.FeedValueToInterfaceBlobObject(
+            var_blob.blob_object, np_array
+        )
+        return var_blob
+
+
+def _LogicalSliceAssign(
+    ref_blob_object: oneflow._oneflow_internal.BlobObject,
+    value_blob_object: oneflow._oneflow_internal.BlobObject,
+    start: Sequence[int],
+    stop: Sequence[int],
+    scope_symbol_id: Optional[int],
+) -> None:
+    """
+    Construct a logical_slice_assign op and run it by oneflow eager
+    """
+
+    def BuildAssignInstruction(builder):
+        op_conf = op_conf_pb.OperatorConf()
+        device_tag = oneflow.current_scope().device_parallel_desc_symbol.device_tag
+        op_conf.device_tag = device_tag
+        op_name = id_util.UniqueStr(OP_PREFIX)
+        op_conf.name = op_name
+        op_conf.user_conf.op_type_name = "logical_slice_assign"
+        op_conf.user_conf.input["value"].s.append("{}/value_0".format(op_name))
+        op_conf.user_conf.input["ref"].s.append("{}/ref_0".format(op_name))
+        parallel_conf = ref_blob_object.parallel_desc_symbol.parallel_conf
+        op_conf.user_conf.attr["parallel_conf"].at_string = str(parallel_conf)
+        op_conf.user_conf.attr["start"].at_list_int64.val[:] = start
+        op_conf.user_conf.attr["stop"].at_list_int64.val[:] = stop
+        op_conf.user_conf.attr["step"].at_list_int64.val[:] = [1] * len(start)
+        bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+        bn_in_op2blob_object["ref_0"] = ref_blob_object
+        bn_in_op2blob_object["value_0"] = value_blob_object
+        op_attribute = op_infer_util.Infer(
+            op_conf, bn_in_op2blob_object, scope_symbol_id
+        )
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildAssignInstruction)
+
+
+def FeedValueToVariable(
+    var_blob: Union[oneflow._oneflow_internal.EagerConsistentBlob, "oneflow.Tensor"],
+    value: ValueContainer,
+    scope_symbol_id: Optional[int],
+) -> None:
+    """
+    Feed the value of `value` to the variable `var_blob`
+    """
+    assert isinstance(
+        value, (EagerBlobTrait, FileBackendVariableBlob, np.ndarray, oneflow.Tensor)
+    ), "Unknown value type: {}".format(type(value).__name__)
+    if isinstance(value, FileBackendVariableBlob):
+        if not value.has_meta_info_:
+            value = FileBackendVariableBlob(
+                value.var_dir_, var_blob.dtype, var_blob.shape
+            )
+    assert var_blob.shape == value.shape, "{} vs {}".format(var_blob.shape, value.shape)
+    if isinstance(value, np.ndarray):
+        value_flow_dtype = dtype_util.convert_numpy_dtype_to_oneflow_dtype(value.dtype)
+    else:
+        value_flow_dtype = value.dtype
+    assert var_blob.dtype == value_flow_dtype, "{} vs {}".format(
+        var_blob.dtype, value_flow_dtype
+    )
+    if isinstance(var_blob, oneflow.Tensor):
+        raise ValueError("Tensor object arguments are not supported")
+    else:
+        assert isinstance(var_blob, EagerBlobTrait)
+        var_blob_object = var_blob.blob_object
+    for (start, stop, slice) in _ReadSlice(value):
+        slice_value_blob = _GetCpu0VariableBlobFromNumpy(slice, var_blob.dtype)
+        _LogicalSliceAssign(
+            var_blob_object, slice_value_blob.blob_object, start, stop, scope_symbol_id
+        )
+
+
+@session_ctx.try_init_default_session
+def LoadVariables(value_dict: Dict[str, ValueContainer], ignore_mismatch: bool = True):
+    """
+    Load value in `value_dict` into oneflow variables.
+    For example, if `value_dict` is {'x', np.ones(x_shape)},
+    the value of variable "x" will all ones.
+    If `ignore_mismatch` is False, an exception will be raised when
+    there is a name in `value_dict` not belonging to any variable.
+    """
+    sync_default_session_if_normal()
+    all_vars = GetAllVariables()
+    for (name, value) in value_dict.items():
+        if name in all_vars:
+            var_blob = interface_op_read_and_write.GetEagerInterfaceBlob(name)
+            scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(var_blob)
+            FeedValueToVariable(var_blob, value, scope_symbol_id)
+        elif not ignore_mismatch:
+            raise RuntimeError('"{}" is not a variable name'.format(name))
+    oneflow._oneflow_internal.eager.single_client.Sync()
+
+
+def _ForEachSlice(
+    container: ValueContainer,
+    f: Union[
+        Callable[[EagerBlobTrait, Sequence[int], Sequence[int]], Any],
+        Callable[[FileBackendVariableBlob, Sequence[int], Sequence[int]], Any],
+        Callable[[np.ndarray, Sequence[int], Sequence[int]], Any],
+    ],
+):
+    """
+    Slice container into slices whose size < SLICE_BYTES. For every slice,
+    yield start_nd_idx, stop_nd_idx and f(slice)
+    """
+    assert isinstance(
+        container, (EagerBlobTrait, FileBackendVariableBlob, np.ndarray, oneflow.Tensor)
+    ), "Unknown type: {}".format(type(container).__name__)
+    assert container.shape is not None
+    SLICE_BYTES = 32 * 1024 * 1024
+    if isinstance(container, np.ndarray):
+        np_dtype = container.dtype
+    else:
+        np_dtype = np.dtype(
+            dtype_util.convert_oneflow_dtype_to_numpy_dtype(container.dtype)
+        )
+    SLICE_LEN = SLICE_BYTES // np_dtype.itemsize
+    start_idx = 0
+    size = _ElemCnt(container.shape)
+    cnt = 1
+    for axis in reversed(range(len(container.shape))):
+        cnt *= container.shape[axis]
+        if cnt > SLICE_LEN:
+            break
+    unit_size = _ElemCnt(tuple(container.shape)[axis + 1 :])
+    max_unit_num = SLICE_LEN // unit_size
+    while start_idx < size:
+        remainder = container.shape[axis]
+        while remainder > 0:
+            unit_num = max_unit_num if remainder >= max_unit_num else remainder
+            length = unit_num * unit_size
+            remainder -= unit_num
+            stop_idx = start_idx + length
+            start_nd_idx = np.unravel_index(start_idx, container.shape)
+            stop_nd_idx = np.unravel_index(stop_idx - 1, container.shape)
+            stop_nd_idx = tuple([x + 1 for x in stop_nd_idx])
+            yield (start_nd_idx, stop_nd_idx, f(container, start_nd_idx, stop_nd_idx))
+            start_idx = stop_idx
+
+
+def generate_values_by_initializer(initializer, shape, dtype):
+    np_dtype = np.dtype(dtype_util.convert_oneflow_dtype_to_numpy_dtype(dtype))
+    length = _ElemCnt(shape)
+    return np.array(initializer(length)).astype(np_dtype).reshape(shape)
+
+
+def init_by_initializer_conf(
+    var_blob: Union[EagerBlobTrait, "oneflow.Tensor"],
+    initializer_conf: initializer_conf_util.InitializerConf,
+    sync_between_multi_machine: bool,
+    scope_symbol_id: Optional[int],
+    random_seed: int = 0,
+):
+    initializer = initializer_util.GetInitializer(
+        initializer_conf, random_seed, var_blob.shape
+    )
+    if initializer is None:
+        return
+
+    def GenerateValueAndAssign(var_blob, start_nd_idx, stop_nd_idx):
+        shape = np.array(stop_nd_idx) - np.array(start_nd_idx)
+        vals = generate_values_by_initializer(initializer, shape, var_blob.dtype)
+        if isinstance(var_blob, oneflow.Tensor):
+            raise ValueError("Tensor object arguments are not supported")
+        else:
+            assert isinstance(var_blob, EagerBlobTrait)
+            var_blob_object = var_blob.blob_object
+        slice_value_blob = _GetCpu0VariableBlobFromNumpy(vals, var_blob.dtype)
+        _LogicalSliceAssign(
+            var_blob_object,
+            slice_value_blob.blob_object,
+            start_nd_idx,
+            stop_nd_idx,
+            scope_symbol_id,
+        )
+
+    for _ in _ForEachSlice(var_blob, GenerateValueAndAssign):
+        pass
+    if sync_between_multi_machine:
+        oneflow._oneflow_internal.eager.single_client.Sync()
+
+
+def Init() -> None:
+    sync_default_session_if_normal()
+    sess = session_ctx.GetDefaultSession()
+    for (op_name, var_blob) in GetAllVariables().items():
+        var_conf = sess.OpAttribute4InterfaceOpName(op_name).op_conf.variable_conf
+        if not (
+            var_conf.HasField("initializer")
+            or var_conf.HasField("initialize_with_snapshot")
+        ):
+            continue
+        if var_conf.HasField("initialize_with_snapshot"):
+            initialize_with_snapshot_conf = var_conf.initialize_with_snapshot
+            if initialize_with_snapshot_conf.HasField("key"):
+                snapshot_key = op_name
+            else:
+                snapshot_key = initialize_with_snapshot_conf.key
+            var_dir = os.path.dirname(
+                os.path.join(initialize_with_snapshot_conf.path, snapshot_key)
+            )
+            LoadVariables({op_name: GetCheckpoint(var_dir)})
+            continue
+        scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(var_blob)
+        init_by_initializer_conf(
+            var_blob, var_conf.initializer, False, scope_symbol_id, var_conf.random_seed
+        )
+    oneflow._oneflow_internal.eager.single_client.Sync()
diff --git a/python/oneflow/framework/compile_context.py b/python/oneflow/framework/compile_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab9a00a02fc4778540f3a21c591c21f1c7ad5677
--- /dev/null
+++ b/python/oneflow/framework/compile_context.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.distribute_context as distribute_ctx
+import oneflow.framework.hob as hob
+import oneflow.framework.placement_context as placement_context
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+
+
+def GetCurJobConfigProto():
+    return enable_if.unique([GetEagerCurJobConfigProto, GetLazyCurJobConfigProto])()
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def GetEagerCurJobConfigProto():
+    function_desc = session_ctx.GetDefaultSession().CurrentEagerGlobalFunctionDesc()
+    assert function_desc is not None
+    return function_desc.job_config_proto
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def GetLazyCurJobConfigProto():
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    function_desc = session_ctx.GetDefaultSession().GetLazyFunctionDesc(job_name)
+    assert function_desc is not None
+    return function_desc.job_config_proto
+
+
+logged_op_confs = set({})
+
+
+def CurJobAddOp(op_conf, scope_symbol=None):
+    if distribute_ctx.IsMirroredStrategyEnabled():
+        return CurJobAddMirroredOp(op_conf, scope_symbol)
+    return CurJobAddConsistentOp(op_conf, scope_symbol)
+
+
+def CurJobAddConsistentOp(op_conf, scope_symbol=None):
+    if scope_symbol is None:
+        scope_symbol = oneflow.current_scope()
+    op_conf.scope_symbol_id = scope_symbol.symbol_id
+    if not op_conf.HasField("device_tag"):
+        device_tag = scope_symbol.device_parallel_desc_symbol.device_tag
+        op_conf.device_tag = device_tag
+    op_attr = c_api_util.CurJobBuildAndInferCtx_AddAndInferConsistentOp(op_conf)
+    if c_api_util.IsInterfaceOpConf(op_conf):
+        sess = session_ctx.GetDefaultSession()
+        sess.AddInfo4InterfaceOpName(op_conf.name, op_attr)
+    return op_attr
+
+
+def CurJobAddMirroredOp(op_conf, scope_symbol=None):
+    assert not hob.consistent_view_enabled(None)
+    if scope_symbol is None:
+        scope_symbol = oneflow.current_scope()
+    op_conf.scope_symbol_id = scope_symbol.symbol_id
+    if not op_conf.HasField("device_tag"):
+        device_tag = scope_symbol.device_parallel_desc_symbol.device_tag
+        op_conf.device_tag = device_tag
+    op_attr = c_api_util.CurJobBuildAndInferCtx_AddAndInferMirroredOp(op_conf)
+    if c_api_util.IsInterfaceOpConf(op_conf):
+        sess = session_ctx.GetDefaultSession()
+        sess.AddInfo4InterfaceOpName(op_conf.name, op_attr)
+    return op_attr
diff --git a/python/oneflow/framework/compiler.py b/python/oneflow/framework/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab7c9a516c62c6b0c29001a8233a72cd624ec40b
--- /dev/null
+++ b/python/oneflow/framework/compiler.py
@@ -0,0 +1,220 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import typing
+from contextlib import contextmanager
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.hob as hob
+import oneflow.framework.input_blob_def as input_blob_util
+import oneflow.framework.placement_context as placement_ctx
+import oneflow.framework.placement_util as placement_util
+import oneflow.framework.push_util as push_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.runtime_mode as runtime_mode
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.framework.typing as oft
+import oneflow.framework.typing_util as oft_util
+import oneflow.ops as ops
+import oneflow.support.enable_if as enable_if
+import oneflow.support.func_inspect_util as func_inspect_util
+
+
+def Compile(session, function_desc, config_proto):
+    with InterpretScope(session, function_desc, config_proto):
+        _CompileJob(session, function_desc)
+        session.StashJob(function_desc.job_func.__name__)
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
+        session.StashJob(
+            function_desc.job_func.__name__,
+            function_desc.job_func.__name__ + "_after_complete",
+        )
+
+
+def EagerRun(session, function_desc, config_proto, args):
+    with InterpretScope(session, function_desc, config_proto):
+        ret = _InterpretGlobalFunction(function_desc, args)
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
+        session_ctx.GetDefaultSession().UpdateInfo4InterfaceOp()
+    return ret
+
+
+@contextmanager
+def InterpretScope(session, function_desc, config_proto):
+    job_conf = function_desc.job_config_proto
+    job_conf.set_job_name(function_desc.job_func.__name__)
+    placement_scope = function_desc.function_attribute.default_placement_scope
+    if placement_scope is None:
+        tag_and_dev_ids = placement_util.GetDefaultMachineDeviceIds(session.resource)
+        hierarchy = None
+    else:
+        assert isinstance(placement_scope, placement_ctx.EmptyPlacementScope)
+        tag_and_dev_ids = (
+            placement_scope.device_tag,
+            placement_scope.machine_device_ids,
+        )
+        hierarchy = placement_scope.hierarchy
+    distribute_strategy = function_desc.function_attribute.default_distribute_strategy
+    if distribute_strategy is None:
+        distribute_strategy = distribute_util.DistributeConsistentStrategy()
+    is_mirrored = isinstance(
+        distribute_strategy, distribute_util.DistributeMirroredStrategy
+    )
+    assert isinstance(hierarchy, (list, tuple)) or hierarchy is None
+    if hierarchy is not None:
+        hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
+    scope = scope_util.MakeInitialScope(
+        job_conf, *tag_and_dev_ids, hierarchy, is_mirrored
+    )
+    with _JobBuildAndInferCtx(job_conf.job_name()), distribute_strategy:
+        c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+        with runtime_mode.ModeScope(runtime_mode.GLOBAL_MODE):
+            with scope_util.ScopeContext(scope):
+                yield
+
+
+def _CompileJob(session, function_desc):
+    func = function_desc.job_func
+    parameters = func.__oneflow_function_signature__.parameters
+    if len(parameters) == 0:
+        func.__oneflow_input_blob_defs__ = ()
+    elif all((p.annotation is inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _GetArgDefault(func)
+    elif all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _MakeInputBlobDefFromParameterSignature(
+            parameters
+        )
+    else:
+        raise NotImplementedError(
+            "All parameters of global function should be annotated"
+        )
+    inputs = _RecursiveMakeInputBlobs(func.__oneflow_input_blob_defs__)
+    ret = func(*inputs)
+    return_annotation = func.__oneflow_function_signature__.return_annotation
+    oft_util.CheckReturnByAnnotation(func.__name__, ret, return_annotation)
+    func.__oneflow_output_remote_blobs__ = _RecursiveMakeRetRemoteBlobs(
+        ret, allow_cpu_return_op=function_desc.function_attribute.allow_cpu_return_op
+    )
+
+
+def _InterpretGlobalFunction(function_desc, args):
+    func = function_desc.job_func
+    parameters = func.__oneflow_function_signature__.parameters
+    if len(parameters) == 0:
+        func.__oneflow_input_blob_defs__ = ()
+    elif all((p.annotation is inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _GetArgDefault(func)
+    elif all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
+        func.__oneflow_input_blob_defs__ = _MakeInputBlobDefFromParameterSignature(
+            parameters
+        )
+    else:
+        raise NotImplementedError(
+            "All parameters of global function should be annotated"
+        )
+    inputs = push_util.MakeEagerInputBlobs(func.__oneflow_input_blob_defs__, args)
+    ret = func(*inputs)
+    return_annotation = func.__oneflow_function_signature__.return_annotation
+    oft_util.CheckReturnByAnnotation(func.__name__, ret, return_annotation)
+    return _RecursiveMakeRetRemoteBlobs(
+        ret, allow_cpu_return_op=function_desc.function_attribute.allow_cpu_return_op
+    )
+
+
+@contextmanager
+def _JobBuildAndInferCtx(job_name):
+    c_api_util.JobBuildAndInferCtx_Open(job_name)
+    try:
+        yield
+    finally:
+        oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+
+def _GetArgDefault(func):
+    if hasattr(func, "__oneflow_arg_default__"):
+        return func.__oneflow_arg_default__
+    return _CloneArgBlobDef(func_inspect_util.GetArgDefaults(func))
+
+
+def _CloneArgBlobDef(args):
+    if isinstance(args, input_blob_util.ArgBlobDef):
+        return args.Clone()
+    if isinstance(args, (tuple, list)):
+        return type(args)((_CloneArgBlobDef(x) for x in args))
+    if isinstance(args, dict):
+        return {k: _CloneArgBlobDef(v) for (k, v) in args}
+    raise NotImplementedError(
+        "oneflow.global_function only accepts nested input blob defs"
+    )
+
+
+def _RecursiveMakeInputBlobs(input_blob_def):
+    if isinstance(input_blob_def, input_blob_util.ArgBlobDef):
+        return ops.InputOpByArgBlobDef(input_blob_def)
+    if isinstance(input_blob_def, (tuple, list)):
+        return type(input_blob_def)(
+            (_RecursiveMakeInputBlobs(x) for x in input_blob_def)
+        )
+    if isinstance(input_blob_def, dict):
+        return {k: _RecursiveMakeInputBlobs(v) for (k, v) in input_blob_def.items()}
+    raise NotImplementedError(
+        "oneflow.global_function accepts "
+        + "ArgBlobDefs or list/tuple/dict nested ArgBlobDefs as argument"
+    )
+
+
+def _MakeInputBlobDefFromParameterSignature(parameters):
+    def CheckAndRecusiveMake(p):
+        return _RecusiveMakeInputBlobDef(p.annotation)
+
+    return tuple((CheckAndRecusiveMake(p) for (_, p) in parameters.items()))
+
+
+def _RecusiveMakeInputBlobDef(cls):
+    if oft.OriginFrom(cls, oft.OneflowNumpyDef):
+        return cls.NewInputBlobDef()
+    elif oft.OriginFrom(cls, typing.Tuple):
+        return tuple((_RecusiveMakeInputBlobDef(a) for a in cls.__args__))
+    else:
+        raise NotImplementedError(
+            "\nannotation %s" % cls
+            + "not supported"
+            + "\nonly support oneflow.typing.Numpy.Placeholder, oneflow.typing.ListNumpy.Placeholder"
+        )
+
+
+def _RecursiveMakeRetRemoteBlobs(remote_blobs, **kwarg):
+    if remote_blobs is None:
+        return None
+    if isinstance(remote_blobs, oneflow._oneflow_internal.BlobDesc):
+        return ops.ReturnRemoteBlob(remote_blobs, **kwarg)
+    if isinstance(remote_blobs, (tuple, list)):
+        return type(remote_blobs)(
+            (_RecursiveMakeRetRemoteBlobs(x, **kwarg) for x in remote_blobs)
+        )
+    if isinstance(remote_blobs, dict):
+        return {
+            k: _RecursiveMakeRetRemoteBlobs(v, **kwarg)
+            for (k, v) in remote_blobs.items()
+        }
+    raise NotImplementedError(
+        "oneflow.global_function returns "
+        + "RemoteBlob or list/tuple/dict nested RemoteBlob only"
+    )
diff --git a/python/oneflow/framework/config_util.py b/python/oneflow/framework/config_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c592c1b0b649fa51efde1944658688435ee1652
--- /dev/null
+++ b/python/oneflow/framework/config_util.py
@@ -0,0 +1,615 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import oneflow._oneflow_internal
+import oneflow.framework.hob as hob
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+
+
+def api_load_library(val: str) -> None:
+    """Load necessary library for job
+
+    Args:
+        val (str): path to shared object file
+    """
+    return enable_if.unique([load_library, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def load_library(val):
+    assert type(val) is str
+    sess = session_ctx.GetDefaultSession()
+    sess.config_proto.load_lib_path.append(val)
+
+
+def api_load_library_now(val: str) -> None:
+    """Load necessary library for job now
+
+    Args:
+        val (str): path to shared object file
+    """
+    return enable_if.unique([load_library_now, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def load_library_now(val):
+    assert type(val) is str
+    oneflow._oneflow_internal.LoadLibraryNow(val)
+
+
+def api_machine_num(val: int) -> None:
+    """Set available number of machine/node for  running job .
+
+    Args:
+        val (int): available number of machines
+    """
+    return enable_if.unique([machine_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def machine_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.machine_num = val
+
+
+def api_gpu_device_num(val: int) -> None:
+    """Set number of GPUs on each machine to run oneflow on.
+
+    Args:
+        val (int): number of GPUs. It is identical on every machine. In other words,
+        you can't specify different number of GPUs you would like to use on each machine.
+    """
+    if oneflow._oneflow_internal.flags.with_cuda():
+        return enable_if.unique([gpu_device_num, do_nothing])(val)
+    else:
+        print(
+            "INFO: for CPU-only OneFlow, oneflow.config.gpu_device_num is equivalent to oneflow.config.cpu_device_num"
+        )
+        print(traceback.format_stack()[-2])
+        return enable_if.unique([cpu_device_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def gpu_device_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.gpu_device_num = val
+
+
+def api_cpu_device_num(val: int) -> None:
+    """Set number of CPUs on each machine to run oneflow on. Usually you don't need to set this.
+
+    Args:
+        val (int): number of CPUs. It is identical on every machine.
+    """
+    return enable_if.unique([cpu_device_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def cpu_device_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.cpu_device_num = val
+
+
+def api_comm_net_worker_num(val: int) -> None:
+    """Set up the workers number in epoll  mode network,
+            If use RDMA mode network, then doesn't need.
+
+    Args:
+        val (int): number of workers
+    """
+    return enable_if.unique([comm_net_worker_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def comm_net_worker_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.comm_net_worker_num = val
+
+
+def api_max_mdsave_worker_num(val: int) -> None:
+    """Set up max number of workers for mdsave process.
+
+    Args:
+        val (int):  max number of workers
+    """
+    return enable_if.unique([max_mdsave_worker_num, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def max_mdsave_worker_num(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.max_mdsave_worker_num = val
+
+
+def api_numa_aware_cuda_malloc_host(val: bool = True) -> None:
+    """Whether or not let numa know  that  cuda allocated host's memory.
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    print(
+        "'enable_numa_aware_cuda_malloc_host' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+def api_compute_thread_pool_size(val: int) -> None:
+    """Set up the size of compute thread pool
+
+    Args:
+        val (int): size of  thread pool
+    """
+    return enable_if.unique([compute_thread_pool_size, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def compute_thread_pool_size(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.compute_thread_pool_size = val
+
+
+def api_rdma_mem_block_mbyte(val: int) -> None:
+    """Set up the memory block size in rdma mode.
+
+    Args:
+        val (int): size of block, e.g. 1024(mb)
+    """
+    return enable_if.unique([rdma_mem_block_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def rdma_mem_block_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.rdma_mem_block_mbyte = val
+
+
+def api_rdma_recv_msg_buf_mbyte(val: int) -> None:
+    """Set up the buffer size for receiving messages in rama mode
+
+    Args:
+        val (int): buffer size, e.g. 1024(mb)
+    """
+    return enable_if.unique([rdma_recv_msg_buf_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def rdma_recv_msg_buf_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.rdma_recv_msg_buf_mbyte = val
+
+
+def api_reserved_host_mem_mbyte(val: int) -> None:
+    """Set up the memory size of reserved host
+
+    Args:
+        val (int):  memory size, e.g. 1024(mb)
+    """
+    return enable_if.unique([reserved_host_mem_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def reserved_host_mem_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.reserved_host_mem_mbyte = val
+
+
+def api_reserved_device_mem_mbyte(val: int) -> None:
+    """Set up the memory size of reserved device
+
+    Args:
+        val (int):  memory size, e.g. 1024(mb)
+    """
+    return enable_if.unique([reserved_device_mem_mbyte, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def reserved_device_mem_mbyte(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.reserved_device_mem_mbyte = val
+
+
+def api_use_rdma(val: bool = True) -> None:
+    """Whether use RDMA to speed up data transmission in cluster nodes or not.
+          if not, then use normal epoll mode.
+
+    Args:
+        val (bool, optional):  Defaults to True.
+    """
+    return enable_if.unique([use_rdma, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def use_rdma(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.use_rdma = val
+
+
+def api_thread_enable_local_message_queue(val: bool) -> None:
+    """Whether or not enable thread using local  message queue.
+
+    Args:
+        val (bool):  True or False
+    """
+    return enable_if.unique([thread_enable_local_message_queue, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def thread_enable_local_message_queue(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.thread_enable_local_message_queue = val
+
+
+def api_enable_debug_mode(val: bool) -> None:
+    """Whether use debug mode or not.
+
+    Args:
+        val (bool):  True or False
+    """
+    return enable_if.unique([enable_debug_mode, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_debug_mode(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_debug_mode = val
+
+
+def api_legacy_model_io_enabled():
+    sess = session_ctx.GetDefaultSession()
+    return sess.config_proto.resource.enable_legacy_model_io
+
+
+def api_enable_legacy_model_io(val: bool = True):
+    """Whether or not use legacy model io.
+
+    Args:
+        val ([type]): True or False
+    """
+    return enable_if.unique([enable_legacy_model_io, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_legacy_model_io(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_legacy_model_io = val
+
+
+def api_enable_model_io_v2(val):
+    """Whether or not use version2  of model input/output function.
+
+    Args:
+        val ([type]): True or False
+    """
+    return enable_if.unique([enable_model_io_v2, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_model_io_v2(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_model_io_v2 = val
+
+
+def api_collect_act_event(val: bool = True) -> None:
+    """Whether or not collect active event.
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([collect_act_event, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def collect_act_event(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.profile_conf.collect_act_event = val
+
+
+def api_enable_fusion(val: bool = True) -> None:
+    """Whether or not allow fusion the operators
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([enable_fusion, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_fusion(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.enable_fusion = val
+
+
+def api_num_callback_threads(val: int) -> None:
+    """Set up number of callback threads for boxing process.
+            Boxing is used to convert between different parallel properties of logical tensor
+
+    Args:
+        val (int): number of  callback threads
+    """
+    return enable_if.unique([num_callback_threads, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def num_callback_threads(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.num_callback_threads = val
+
+
+def api_enable_tensor_float_32_compute(val: bool = True) -> None:
+    """Whether or not to enable Tensor-float-32 on supported GPUs
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([enable_tensor_float_32_compute, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_tensor_float_32_compute(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_tensor_float_32_compute = val
+
+
+def api_enable_mem_chain_merge(val: bool = True) -> None:
+    """Whether or not to enable MemChain merge.
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    return enable_if.unique([enable_mem_chain_merge, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def enable_mem_chain_merge(val=True):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.enable_mem_chain_merge = val
+
+
+def api_nccl_use_compute_stream(val: bool = False) -> None:
+    """Whether or not nccl use compute stream to reuse nccl memory and speedup
+
+    Args:
+        val (bool, optional): True or False. Defaults to False.
+    """
+    return enable_if.unique([nccl_use_compute_stream, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_use_compute_stream(val=False):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.nccl_use_compute_stream = val
+
+
+def api_disable_group_boxing_by_dst_parallel(val: bool = False) -> None:
+    """Whether or not disable group boxing by dst parallel pass to reduce boxing memory life cycle.
+
+    Args:
+        val (bool, optional): True or False. Defaults to False.
+    """
+    return enable_if.unique([disable_group_boxing_by_dst_parallel, do_nothing])(val=val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def disable_group_boxing_by_dst_parallel(val=False):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.disable_group_boxing_by_dst_parallel = val
+
+
+def api_nccl_num_streams(val: int) -> None:
+    """Set up the number of nccl parallel streams while use boxing
+
+    Args:
+        val (int): number of streams
+    """
+    return enable_if.unique([nccl_num_streams, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_num_streams(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.nccl_num_streams = val
+
+
+def api_nccl_fusion_threshold_mb(val: int) -> None:
+    """Set up threshold for oprators fusion
+
+    Args:
+        val (int): int number, e.g. 10(mb)
+    """
+    return enable_if.unique([nccl_fusion_threshold_mb, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_threshold_mb(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_threshold_mb = val
+
+
+def api_nccl_fusion_all_reduce_use_buffer(val: bool) -> None:
+    """Whether or not use buffer during nccl fusion progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_all_reduce_use_buffer, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_all_reduce_use_buffer(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_all_reduce_use_buffer = (
+        val
+    )
+
+
+def api_nccl_fusion_all_reduce(val: bool) -> None:
+    """Whether or not use nccl fusion during all reduce progress
+
+    Args:
+        val (bool):  True or False
+    """
+    return enable_if.unique([nccl_fusion_all_reduce, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_all_reduce(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_all_reduce = val
+
+
+def api_nccl_fusion_reduce_scatter(val: bool) -> None:
+    """Whether or not  use nccl fusion during reduce scatter progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_reduce_scatter, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_reduce_scatter(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_reduce_scatter = val
+
+
+def api_nccl_fusion_all_gather(val: bool) -> None:
+    """Whether or not use nccl fusion during all  gather progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_all_gather, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_all_gather(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_all_gather = val
+
+
+def api_nccl_fusion_reduce(val: bool) -> None:
+    """Whether or not use nccl fusion during reduce progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_reduce, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_reduce(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_reduce = val
+
+
+def api_nccl_fusion_broadcast(val: bool) -> None:
+    """Whether or not use nccl fusion during broadcast progress
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_fusion_broadcast, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_broadcast(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_broadcast = val
+
+
+def api_nccl_fusion_max_ops(val: int) -> None:
+    """Maximum number of ops for nccl fusion.
+
+    Args:
+        val (int): Maximum number of ops
+    """
+    return enable_if.unique([nccl_fusion_max_ops, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_fusion_max_ops(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is int
+    sess.config_proto.resource.collective_boxing_conf.nccl_fusion_max_ops = val
+
+
+def api_nccl_enable_all_to_all(val: bool) -> None:
+    """Whether or not use nccl all2all during s2s boxing
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_enable_all_to_all, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_enable_all_to_all(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_enable_all_to_all = val
+
+
+def api_nccl_enable_mixed_fusion(val: bool) -> None:
+    """Whether or not use nccl mixed fusion
+
+    Args:
+        val (bool): True or False
+    """
+    return enable_if.unique([nccl_enable_mixed_fusion, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.session_initialized)
+def nccl_enable_mixed_fusion(val):
+    sess = session_ctx.GetDefaultSession()
+    assert type(val) is bool
+    sess.config_proto.resource.collective_boxing_conf.nccl_enable_mixed_fusion = val
+
+
+@enable_if.condition(hob.in_normal_mode & hob.session_initialized)
+def do_nothing(*args, **kwargs):
+    print("Nothing happened because the session is running")
+    return False
diff --git a/python/oneflow/framework/distribute.py b/python/oneflow/framework/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..117ab34e7f33086bb70c139e5456d1d6e23d1478
--- /dev/null
+++ b/python/oneflow/framework/distribute.py
@@ -0,0 +1,226 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+from contextlib import contextmanager
+
+import oneflow._oneflow_internal
+import oneflow.framework.distribute_context as distribute_ctx
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_mirrored_strategy():
+    print(
+        "WARNING:",
+        "oneflow.distribute.mirrored_strategy",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.scope.mirrored_view"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return DistributeMirroredStrategy()
+
+
+class DistributeMirroredStrategy(distribute_ctx.DistributeStrategy):
+    """Create a scope in mirrored view. All operators within the scope will be mirrored among diffierent accelerators.
+    Usage::
+
+        with oneflow.scope.mirrored_view():
+            ...
+
+    """
+
+    def __init__(self):
+        distribute_ctx.DistributeStrategy.__init__(self, True)
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_mirrored_strategy_enabled():
+    print(
+        "WARNING:",
+        "oneflow.distribute.mirrored_strategy_enabled",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.scope.mirrored_view_enabled"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return MirroredStrategyEnabled()
+
+
+def MirroredStrategyEnabled() -> bool:
+    """
+
+    Returns:
+        bool: `True` if mirrored strategy is enabled in current context where this function is called.
+
+    """
+    return distribute_ctx.IsMirroredStrategyEnabled()
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_consistent_strategy():
+    print(
+        "WARNING:",
+        "oneflow.distribute.consistent_strategy",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.scope.consistent_view"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return DistributeConsistentStrategy()
+
+
+class DistributeConsistentStrategy(distribute_ctx.DistributeStrategy):
+    """Create a scope in consistent view. All operators within the scope will be automatically parallelized among diffierent accelerators for best performance and least data transfer.
+
+    Usage::
+
+        with oneflow.scope.consistent_view():
+            ...
+
+    """
+
+    def __init__(self):
+        distribute_ctx.DistributeStrategy.__init__(self, False)
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_consistent_strategy_enabled():
+    print(
+        "WARNING:",
+        "oneflow.distribute.consistent_strategy_enabled",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.scope.consistent_view_enabled"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return ConsistentStrategyEnabled()
+
+
+def ConsistentStrategyEnabled() -> bool:
+    """
+
+    Returns:
+        bool: `True` if consistent strategy is enabled in current context where this function is called.
+
+    """
+    return distribute_ctx.IsConsistentStrategyEnabled()
+
+
+def split(axis: int) -> oneflow._oneflow_internal.distribute.SplitDistribute:
+    """Generate a split scheme in which op will be splitted at `axis`.
+
+    Args:
+        axis (int): At `axis` the op will be splitted.
+
+    Returns:
+        SplitDistribute: Split scheme object, often required by `with_distribute` method of `Blob` or `oneflow.get_variable`.
+
+    Example::
+        weight = weight.with_distribute(distribute.split(1))
+
+    """
+    assert type(axis) is int
+    return oneflow._oneflow_internal.distribute.split(axis)
+
+
+def broadcast() -> oneflow._oneflow_internal.distribute.BroadcastDistribute:
+    """Generate a broadcast scheme.
+
+    Returns:
+        BroadcastDistribute: Broadcast scheme object, often required by `with_distribute` method of `Blob` or `oneflow.get_variable`.
+
+    Example::
+        segment_ids = segment_ids.with_distribute(flow.distribute.broadcast())
+
+    """
+    return oneflow._oneflow_internal.distribute.broadcast()
+
+
+def auto() -> oneflow._oneflow_internal.distribute.AutoDistribute:
+    """Generate a broadcast scheme.
+
+    Returns:
+        AutoDistribute: Auto distribute scheme object, often required by `with_distribute` method of `Blob` or `oneflow.get_variable`.
+
+    """
+    return oneflow._oneflow_internal.distribute.auto()
+
+
+def assert_is_valid_distribute(
+    distribute: oneflow._oneflow_internal.distribute.Distribute,
+) -> None:
+    assert isinstance(
+        distribute, oneflow._oneflow_internal.distribute.Distribute
+    ), "not a valid distribute policy.\n           expected: 1) oneflow.distribute.split(axis); 2) oneflow.distribute.broadcast(); 3) oneflow.distribute.auto()"
+
+
+def get_local_rank():
+    return oneflow._oneflow_internal.GetLocalRank()
+
+
+def get_rank():
+    """Returns the rank of current process group.
+
+    Returns:
+        The rank of the process group.
+
+    """
+    return oneflow._oneflow_internal.GetRank()
+
+
+def get_world_size():
+    """Returns the number of processes in the current process group.
+
+    Returns:
+        The world size of the process group.
+
+    """
+    return oneflow._oneflow_internal.GetWorldSize()
+
+
+def is_multi_client():
+    return oneflow._oneflow_internal.IsMultiClient()
+
+
+def split_sbp(
+    axis: int,
+) -> oneflow._oneflow_internal.oneflow.core.job.sbp_parallel.SbpParallel:
+    """Generate a split scheme in which op will be splitted at `axis`.
+
+    Args:
+        axis (int): At `axis` the op will be splitted.
+
+    Returns:
+        SbpParallel: Split scheme object, often required by `to_consistent` method of `Tensor`
+
+    Example::
+        array = numpy.array([[1.0, 2.0], [3.0, 4.0]])
+        t1 = flow.tensor(array)
+        ct2 = t1.to_consistent(sbp=flow.sbp.split(0), placement=("cuda", {0: [0, 1, 2, 3]}))
+
+    """
+    assert type(axis) is int
+    return oneflow._oneflow_internal.sbp.split(axis)
diff --git a/python/oneflow/framework/distribute_context.py b/python/oneflow/framework/distribute_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf87ba3dd1adbe9b1c98951caff9ed4053ba0f4
--- /dev/null
+++ b/python/oneflow/framework/distribute_context.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_ctx
+
+
+class DistributeStrategy(object):
+    def __init__(self, is_mirrored):
+        self.is_mirrored_ = is_mirrored
+        self.scope_context_ = None
+        sess = session_ctx.GetDefaultSession()
+        if sess.is_running and (
+            not sess.has_empty_is_mirrored_strategy_enabled_stack()
+        ):
+
+            def BuildScope(old_scope, builder):
+                return builder.BuildScopeWithNewIsMirrored(old_scope, is_mirrored)
+
+            self.scope_context_ = scope_util.ScopeContext(
+                scope_util.MakeScope(BuildScope)
+            )
+
+    def __enter__(self, *argc, **kwarg):
+        PushMirroredStrategyEnabled(self.is_mirrored_)
+        if self.scope_context_ is not None:
+            self.scope_context_.__enter__(*argc, **kwarg)
+
+    def __exit__(self, *argc, **kwarg):
+        PopMirroredStrategyEnabled()
+        if self.scope_context_ is not None:
+            self.scope_context_.__exit__(*argc, **kwarg)
+
+
+def PushMirroredStrategyEnabled(val):
+    session_ctx.GetDefaultSession().push_mirrored_strategy_enabled(val)
+
+
+def IsMirroredStrategyEnabled():
+    return session_ctx.GetDefaultSession().is_mirrored_strategy_enabled()
+
+
+def IsConsistentStrategyEnabled():
+    return session_ctx.GetDefaultSession().is_consistent_strategy_enabled()
+
+
+def PopMirroredStrategyEnabled():
+    session_ctx.GetDefaultSession().pop_mirrored_strategy_enabled()
diff --git a/python/oneflow/framework/docstr/__init__.py b/python/oneflow/framework/docstr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b00d55675f10b45d6cb639c428d6ca68b41f7fc
--- /dev/null
+++ b/python/oneflow/framework/docstr/__init__.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .math_ops import *
+from .random import *
diff --git a/python/oneflow/framework/docstr/math_ops.py b/python/oneflow/framework/docstr/math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd2134ce42a2d8cec95919521f08bb28f47951c1
--- /dev/null
+++ b/python/oneflow/framework/docstr/math_ops.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+from oneflow.framework.docstr.utils import add_docstr
+
+add_docstr(
+    oneflow.F.sin,
+    "\n    sin(x: Tensor) -> Tensor\n\n    Returns a new tensor with the sine of the elements of :attr:`input`.\n\n    .. math::\n\n        \\text{y}_{i} = \\sin(\\text{x}_{i})\n\n    Args:\n        x (Tensor): the input tensor.\n\n    For example:\n\n    .. code-block:: python\n\n        >>> import oneflow as flow\n        >>> import numpy as np\n        >>> x1 = flow.Tensor(np.array([-0.5461,  0.1347, -2.7266, -0.2746]).astype(np.float32))\n        >>> y1 = flow.F.sin(x1)\n        >>> y1\n        tensor([-0.5194,  0.1343, -0.4032, -0.2712], dtype=oneflow.float32)\n        >>> x2 = flow.Tensor(np.array([-1.4, 2.6, 3.7]).astype(np.float32),device=flow.device('cuda'))\n        >>> y2 = flow.F.sin(x2)\n        >>> y2\n        tensor([-0.9854,  0.5155, -0.5298], device='cuda:0', dtype=oneflow.float32)\n\n\n",
+)
+add_docstr(
+    oneflow.F.cos,
+    "\n    cos(x: Tensor) -> Tensor\n\n    Returns a new tensor with the cosine  of the elements of :attr:`input`.\n    \n    .. math::\n        \\text{y}_{i} = \\cos(\\text{x}_{i})\n\n    Args:\n        x (Tensor): the input tensor.\n\n    For example:\n\n    .. code-block:: python\n\n        >>> import oneflow as flow\n        >>> import numpy as np\n        >>> x = np.array([1.4309,  1.2706, -0.8562,  0.9796])\n        >>> x = flow.Tensor(x, dtype=flow.float32)\n        >>> y = flow.F.cos(x)\n        >>> y\n        tensor([0.1394, 0.2957, 0.6553, 0.5574], dtype=oneflow.float32)\n\n",
+)
diff --git a/python/oneflow/framework/docstr/random.py b/python/oneflow/framework/docstr/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c24487224d709da6e14789eefe66986b3ba3d58
--- /dev/null
+++ b/python/oneflow/framework/docstr/random.py
@@ -0,0 +1,22 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+from oneflow.framework.docstr.utils import add_docstr
+
+add_docstr(
+    oneflow.F.bernoulli,
+    "\n    bernoulli(input, *, generator=None, out=None)\n    \n    This operator returns a Tensor with binaray random numbers (0 / 1) from a Bernoulli distribution.\n\n    Args:\n        input (Tensor): the input tensor of probability values for the Bernoulli distribution\n        generator: (Generator, optional) a pseudorandom number generator for sampling\n        out (Tensor, optional): the output tensor.\n\n    Shape:\n        - Input: :math:`(*)`. Input can be of any shape\n        - Output: :math:`(*)`. Output is of the same shape as input\n\n    For example:\n\n    .. code-block:: python\n\n        >>> import numpy as np\n        >>> import oneflow as flow\n        >>> arr = np.array(\n        ...    [\n        ...        [1.0, 1.0, 1.0],\n        ...        [1.0, 1.0, 1.0],\n        ...        [1.0, 1.0, 1.0],\n        ...    ]\n        ... )\n        >>> x = flow.Tensor(arr)\n        >>> y = flow.F.bernoulli(x)\n        >>> y\n        tensor([[1., 1., 1.],\n                [1., 1., 1.],\n                [1., 1., 1.]], dtype=oneflow.float32)\n\n    ",
+)
diff --git a/python/oneflow/framework/docstr/utils.py b/python/oneflow/framework/docstr/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cfc557af1503fa70e1de1792d7efe631723f1e8
--- /dev/null
+++ b/python/oneflow/framework/docstr/utils.py
@@ -0,0 +1,25 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+_function_docstr = {}
+
+
+def add_docstr(fun, docstr: str):
+    _function_docstr[fun] = docstr
+
+
+def register_docstr():
+    for (fun, docstr) in _function_docstr.items():
+        setattr(fun, "__doc__", docstr)
diff --git a/python/oneflow/framework/dtype.py b/python/oneflow/framework/dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d01d3962dc82ecf91ef3b8ceb43194eab06503f
--- /dev/null
+++ b/python/oneflow/framework/dtype.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.core.common.data_type_pb2 as data_type_pb2
+
+_dtypes = [
+    oneflow.char,
+    oneflow.float,
+    oneflow.float32,
+    oneflow.double,
+    oneflow.float64,
+    oneflow.float16,
+    oneflow.int8,
+    oneflow.int32,
+    oneflow.int64,
+    oneflow.uint8,
+    oneflow.record,
+    oneflow.tensor_buffer,
+]
+
+
+def dtypes():
+    return _dtypes
+
+
+def convert_proto_dtype_to_oneflow_dtype(proto_dtype):
+    return oneflow._oneflow_internal.deprecated.GetDTypeByDataType(proto_dtype)
+
+
+_ONEFLOW_DTYPE_TO_NUMPY_DTYPE = {
+    oneflow.char: np.byte,
+    oneflow.float: np.float32,
+    oneflow.float16: np.float16,
+    oneflow.float32: np.float32,
+    oneflow.float64: np.double,
+    oneflow.double: np.double,
+    oneflow.int8: np.int8,
+    oneflow.int32: np.int32,
+    oneflow.int64: np.int64,
+    oneflow.uint8: np.uint8,
+}
+
+
+def convert_oneflow_dtype_to_numpy_dtype(oneflow_dtype: oneflow.dtype):
+    if oneflow_dtype not in _ONEFLOW_DTYPE_TO_NUMPY_DTYPE:
+        raise NotImplementedError
+    return _ONEFLOW_DTYPE_TO_NUMPY_DTYPE[oneflow_dtype]
+
+
+def convert_numpy_dtype_to_oneflow_dtype(numpy_dtype: np.dtype):
+    for (k, v) in _ONEFLOW_DTYPE_TO_NUMPY_DTYPE.items():
+        if v == numpy_dtype:
+            return k
+    raise NotImplementedError
+
+
+del data_type_pb2
+del np
diff --git a/python/oneflow/framework/env_util.py b/python/oneflow/framework/env_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef07485ee0994f1f27d4b574b084fb1a2bbb5047
--- /dev/null
+++ b/python/oneflow/framework/env_util.py
@@ -0,0 +1,413 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import socket
+import traceback
+from contextlib import closing
+
+import oneflow._oneflow_internal
+import oneflow.core.control.ctrl_bootstrap_pb2 as ctrl_bootstrap_pb
+import oneflow.core.job.env_pb2 as env_pb
+import oneflow.core.job.resource_pb2 as resource_util
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.hob as hob
+import oneflow.framework.placement_context as placement_ctx
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+from oneflow import oneflow_deprecate
+
+
+def api_all_device_placement(device_type: str) -> None:
+    """Return a placement containing all devices of all machines under env.
+
+    Args:
+        device_type (str): cuda or cpu
+    """
+    return oneflow._oneflow_internal.AllDevicePlacement(device_type)
+
+
+def api_enable_eager_execution(val: bool = True) -> None:
+    """If True, job will execute in eager mode, else use lazy mode(static graph).
+
+    Args:
+        val (bool, optional): Whether  eager execution or not.  Defaults to True.
+    """
+    return enable_if.unique([enable_eager_environment])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.any_global_function_defined)
+def enable_eager_environment(val=True):
+    return oneflow._oneflow_internal.EnableEagerEnvironment(val)
+
+
+def api_env_init() -> bool:
+    """Init environment for job
+
+    Returns:
+        bool: [description]
+    """
+    return enable_if.unique([env_init, do_nothing])()
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def env_init():
+    global default_env_proto
+    is_multi_client = oneflow._oneflow_internal.IsMultiClient()
+    assert len(default_env_proto.machine) > 0
+    CompleteEnvProto(default_env_proto, is_multi_client)
+    c_api_util.InitEnv(default_env_proto, is_multi_client)
+    if not is_multi_client:
+        if oneflow._oneflow_internal.CurrentMachineId() == 0:
+            scope_util.InitScopeStack()
+        else:
+            exit(0)
+    return True
+
+
+def api_get_current_resource() -> resource_util.Resource:
+    """Get current resources, such as:machine nums, cpu/gpu device nums,
+            epoch network threed num, rdma params...
+
+    Returns:
+        resource_util.Resource: [description]
+    """
+    return enable_if.unique([get_current_resource])()
+
+
+@enable_if.condition(hob.in_normal_mode & hob.env_initialized)
+def get_current_resource():
+    return c_api_util.CurrentResource()
+
+
+def api_get_current_machine_id():
+    """Get machine id of current machine/node
+
+    Returns:
+        [type]: [description]
+    """
+    return enable_if.unique([get_current_machine_id])()
+
+
+@enable_if.condition(hob.in_normal_mode & hob.env_initialized)
+def get_current_machine_id() -> int:
+    return oneflow._oneflow_internal.CurrentMachineId()
+
+
+def api_machine(*val: list) -> None:
+    """Set machines' hostnames.
+
+    For instance:
+
+        oneflow.env.machine([{"addr": "192.168.1.1"}, {"addr": "192.168.1.2"}])
+
+    Args:
+        val:  `list`, `tuple` or multiple arguments of `dict`. First in the list is the master machine.
+    """
+    return enable_if.unique([machine, do_nothing])(*val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def machine(*val):
+    del default_env_proto.machine[:]
+    if len(val) == 1 and isinstance(val[0], (list, tuple)):
+        val = val[0]
+    default_env_proto.ClearField("machine")
+    default_env_proto.machine.extend(_MakeMachine(val))
+
+
+def api_ctrl_port(val: int) -> None:
+    """Set port number used to control the execution across multiple machines. Same on every machine.
+
+    Args:
+        val: a port number accessible to peer machines
+    """
+    return enable_if.unique([ctrl_port, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def ctrl_port(val):
+    assert type(val) is int
+    default_env_proto.ctrl_port = val
+
+
+def api_data_port(val: int) -> None:
+    """Set port number used to data transfer among multiple machines. Same on every machine.
+
+    Args:
+        val: a port number accessible to peer machines
+    """
+    return enable_if.unique([data_port, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def data_port(val):
+    assert type(val) is int
+    default_env_proto.data_port = val
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def api_grpc_use_no_signal(val: bool = True) -> None:
+    """Set rpc use signal or not (deprecate)
+
+    Args:
+        val (bool, optional): True or False. Defaults to True.
+    """
+    print(
+        "WARNING:",
+        "oneflow.env.grpc_use_no_signal is deprecated, users no longer need to set rpc use signal or not. \n",
+        traceback.format_stack()[-2],
+    )
+    return None
+
+
+def api_log_dir(val: str) -> None:
+    """Specify a dir to store OneFlow's logging files. If not specified, it is `./log` by default.
+
+    Args:
+        val (str): string , log file path
+    """
+    return enable_if.unique([log_dir, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def log_dir(val):
+    assert type(val) is str
+    default_env_proto.cpp_logging_conf.log_dir = val
+
+
+def api_logtostderr(val: int) -> None:
+    """Set whether log messages go to stderr instead of logfiles
+
+    Args:
+        val (int): [description]
+    """
+    return enable_if.unique([logtostderr, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def logtostderr(val):
+    assert type(val) is int
+    default_env_proto.cpp_logging_conf.logtostderr = val
+
+
+def api_logbuflevel(val: int) -> None:
+    """Log messages at a level <= this flag are buffered.
+            Log messages at a higher level are flushed immediately.
+
+    Args:
+        val (int): int, number of level
+    """
+    return enable_if.unique([logbuflevel, do_nothing])(val)
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def logbuflevel(val):
+    assert type(val) is int
+    default_env_proto.cpp_logging_conf.logbuflevel = val
+
+
+@enable_if.condition(hob.in_normal_mode & hob.env_initialized)
+def do_nothing(*args, **kwargs):
+    print("Nothing happened because environment has been initialized")
+    return False
+
+
+def CompleteEnvProto(env_proto, is_multi_client):
+    if is_multi_client:
+        _UpdateDefaultEnvProtoByMultiClientEnvVars(env_proto)
+    if env_proto.HasField("ctrl_port") == False:
+        if len(env_proto.machine) == 1:
+            env_proto.ctrl_port = _FindFreePort()
+        else:
+            raise ValueError(
+                "a ctrl_port is required if running multi-node, set it with 'oneflow.env.ctrl_port([YOUR PORT])'"
+            )
+
+
+def _MakeMachine(machines):
+    if isinstance(machines, str):
+        machines = [machines]
+    rp_machine = env_pb.EnvProto().machine
+    for m_data in machines:
+        m = rp_machine.add()
+        if isinstance(m_data, str):
+            m.addr = m_data
+        elif isinstance(m_data, dict):
+            if "addr" in m_data:
+                m.addr = m_data["addr"]
+            if "ctrl_port_agent" in m_data:
+                m.ctrl_port_agent = m_data["ctrl_port_agent"]
+            if "data_port_agent" in m_data:
+                m.data_port_agent = m_data["data_port_agent"]
+        else:
+            raise NotImplementedError
+    id = 0
+    addrs_for_check = set()
+    for m in rp_machine:
+        m.id = id
+        id += 1
+        assert m.addr not in addrs_for_check
+        addrs_for_check.add(m.addr)
+    return rp_machine
+
+
+def api_init_bootstrap_confs(*val: list, **kargs) -> None:
+    return enable_if.unique([MakeBootstrapConfs, do_nothing])(*val, **kargs)
+
+
+def _MakeBootstrapConf(bootstrap_info: dict):
+    global config_master_addr
+    assert config_master_addr.HasField("host"), "must config master host first"
+    assert config_master_addr.HasField("port"), "must config master port first"
+    assert config_world_size != 0, "must config world size first"
+    bootstrap_conf = ctrl_bootstrap_pb.BootstrapConf()
+    bootstrap_conf.master_addr.CopyFrom(config_master_addr)
+    bootstrap_conf.world_size = config_world_size
+    assert "rank" in bootstrap_info
+    bootstrap_conf.rank = bootstrap_info["rank"]
+    if "host" in bootstrap_info:
+        bootstrap_conf.host = bootstrap_info["host"]
+    global config_bootstrap_ctrl_port
+    if config_bootstrap_ctrl_port != 0:
+        bootstrap_conf.ctrl_port = config_bootstrap_ctrl_port
+    global config_node_size
+    if config_node_size != 0:
+        bootstrap_conf.node_size = config_node_size
+    return bootstrap_conf
+
+
+@enable_if.condition(hob.in_normal_mode & ~hob.env_initialized)
+def MakeBootstrapConfs(
+    node_list, master_port, world_size=0, ctrl_port=-1, node_size=-1
+):
+    """Set ctrl_bootstrap_conf' info.
+
+    For instance:
+
+        ONEFLOW_TEST_NODE_LIST=192.168.1.16,192.168.1.15 ONEFLOW_TEST_MASTER_PORT=43256
+        ONEFLOW_TEST_WORLD_SIZE=2 ONEFLOW_TEST_RANK_CTRL_PORT=34527
+
+    Args:
+        val:  `list`, First in the list is the master machine.
+    """
+    if isinstance(node_list, str):
+        node_list = [node_list]
+    global global_ctrl_bootstrap_confs
+    assert len(global_ctrl_bootstrap_confs) == 0, "ctrl_bootstrap_conf has been inited"
+    global config_master_addr
+    config_master_addr.host = node_list[0]
+    config_master_addr.port = master_port
+    global config_world_size
+    if world_size == 0:
+        config_world_size = len(node_list)
+    else:
+        assert world_size % len(node_list) == 0
+        config_world_size = world_size
+    global config_bootstrap_ctrl_port
+    if ctrl_port != -1:
+        config_bootstrap_ctrl_port = ctrl_port
+    global config_node_size
+    if node_size != -1:
+        config_node_size = node_size
+    rank = 0
+    for rank_host in node_list:
+        assert isinstance(rank_host, str)
+        bootstrap_conf = _MakeBootstrapConf({"rank": rank, "host": rank_host})
+        if rank == 0:
+            global default_env_proto
+            default_env_proto.ctrl_bootstrap_conf.CopyFrom(bootstrap_conf)
+        global_ctrl_bootstrap_confs.append(bootstrap_conf)
+        rank += 1
+    return global_ctrl_bootstrap_confs
+
+
+def _DefaultEnvProto():
+    env_proto = env_pb.EnvProto()
+    machine = env_proto.machine.add()
+    machine.id = 0
+    machine.addr = "127.0.0.1"
+    return env_proto
+
+
+def _FindFreePort():
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(("localhost", 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
+
+
+def GetEnvDefaultParallelConf(device_tag):
+    if device_tag not in device_tag2default_parallel_conf:
+        parallel_conf = placement_ctx.MakeParallelConf4Resource(
+            device_tag, c_api_util.EnvResource()
+        )
+        device_tag2default_parallel_conf[device_tag] = parallel_conf
+    return device_tag2default_parallel_conf[device_tag]
+
+
+def HasAllMultiClientEnvVars():
+    env_var_names = ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK", "LOCAL_RANK"]
+    has_all_env_vars = all([os.getenv(x) for x in env_var_names])
+    if not has_all_env_vars:
+        has_at_least_one_env_var = any([os.getenv(x) for x in env_var_names])
+        assert not has_at_least_one_env_var
+    return has_all_env_vars
+
+
+def SetDefaultMultiClientEnvVars():
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = str(_FindFreePort())
+    os.environ["WORLD_SIZE"] = "1"
+    os.environ["RANK"] = "0"
+    os.environ["LOCAL_RANK"] = "0"
+
+
+def _UpdateDefaultEnvProtoByMultiClientEnvVars(env_proto):
+    assert HasAllMultiClientEnvVars()
+
+    def str2int(env_config):
+        assert env_config.isdigit()
+        return int(env_config)
+
+    bootstrap_conf = ctrl_bootstrap_pb.BootstrapConf()
+    master_addr = ctrl_bootstrap_pb.Address()
+    master_addr.host = os.getenv("MASTER_ADDR")
+    master_addr.port = str2int(os.getenv("MASTER_PORT"))
+    bootstrap_conf.master_addr.CopyFrom(master_addr)
+    bootstrap_conf.world_size = str2int(os.getenv("WORLD_SIZE"))
+    bootstrap_conf.rank = str2int(os.getenv("RANK"))
+    env_proto.ctrl_bootstrap_conf.CopyFrom(bootstrap_conf)
+    cpp_logging_conf = env_pb.CppLoggingConf()
+    if os.getenv("GLOG_log_dir"):
+        cpp_logging_conf.log_dir = os.getenv("GLOG_log_dir")
+    if os.getenv("GLOG_logtostderr"):
+        cpp_logging_conf.logtostderr = os.getenv("GLOG_logtostderr")
+    if os.getenv("GLOG_logbuflevel"):
+        cpp_logging_conf.logbuflevel = os.getenv("GLOG_logbuflevel")
+    env_proto.cpp_logging_conf.CopyFrom(cpp_logging_conf)
+
+
+device_tag2default_parallel_conf = {}
+default_env_proto = _DefaultEnvProto()
+config_master_addr = ctrl_bootstrap_pb.Address()
+config_world_size = 0
+config_bootstrap_ctrl_port = 0
+config_node_size = 0
+global_ctrl_bootstrap_confs = []
diff --git a/python/oneflow/framework/function_desc.py b/python/oneflow/framework/function_desc.py
new file mode 100644
index 0000000000000000000000000000000000000000..f35926a97ed8fe408ffeaa4a14b0553e87296e2d
--- /dev/null
+++ b/python/oneflow/framework/function_desc.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.job_conf as job_conf_cfg
+import oneflow.framework.hob as hob
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+
+
+class FunctionAttribute(object):
+    def __init__(self):
+        self.default_placement_scope = None
+        self.default_distribute_strategy = None
+        self.allow_cpu_return_op = True
+
+
+class FunctionDesc(object):
+    def __init__(self, job_func=None, job_config_proto=None, function_attribute=None):
+        if job_config_proto is None:
+            job_config_proto = job_conf_cfg.JobConfigProto()
+        if function_attribute is None:
+            function_attribute = FunctionAttribute()
+        self.job_func = job_func
+        self.job_config_proto = job_config_proto
+        self.job_config_proto.mutable_predict_conf()
+        self.function_attribute = function_attribute
+
+    def IsTrainable(self):
+        if self.job_config_proto.has_train_conf():
+            return True
+        if self.job_config_proto.has_predict_conf():
+            return False
+        raise NotImplementedError
+
+    def HasAttr(self, attr_name):
+        if attr_name == "flag_name2flag_value":
+            return False
+        name2default = session_ctx.GetDefaultSession().function_flag_name2default_val
+        if attr_name in self.job_config_proto.flag_name2flag_value():
+            return True
+        return getattr(self.job_config_proto, "has_" + attr_name)()
+
+    def __getattr__(self, attr_name):
+        assert attr_name != "flag_name2flag_value"
+        flag_name2flag_value = self.job_config_proto.flag_name2flag_value()
+        name2default = session_ctx.GetDefaultSession().function_flag_name2default_val
+        if attr_name not in name2default:
+            assert getattr(self.job_config_proto, "has_" + attr_name)()
+            return getattr(self.job_config_proto, attr_name)()
+        attr_value = name2default[attr_name]
+        if attr_name in flag_name2flag_value:
+            attr_value = flag_name2flag_value[attr_name]
+        if attr_value.HasField("at_bool"):
+            return attr_value.at_bool
+        elif attr_value.HasField("at_int64"):
+            return attr_value.at_int64
+        elif attr_value.HasField("at_double"):
+            return attr_value.at_double
+        elif attr_value.HasField("at_string"):
+            return attr_value.at_string
+        else:
+            raise NotImplementedError()
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def GetCurrentEagerGlobalFunctionDesc():
+    sess = session_ctx.GetDefaultSession()
+    ret = sess.CurrentEagerGlobalFunctionDesc()
+    assert ret is not None
+    return ret
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def GetCurrentLazyGlobalFunctionDesc():
+    sess = session_ctx.GetDefaultSession()
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    ret = sess.GetLazyFunctionDesc(job_name)
+    assert ret is not None
+    return ret
+
+
+def api_current_global_function_desc() -> FunctionDesc:
+    api_func = enable_if.unique(
+        [GetCurrentLazyGlobalFunctionDesc, GetCurrentEagerGlobalFunctionDesc]
+    )
+    return api_func()
diff --git a/python/oneflow/framework/function_util.py b/python/oneflow/framework/function_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..548ea33066c9afb547813d069dff8cb31a8a82f2
--- /dev/null
+++ b/python/oneflow/framework/function_util.py
@@ -0,0 +1,988 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import copy
+import functools
+import inspect
+import re
+import sys
+import traceback
+from typing import Any, Callable, Optional, Union
+
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.common.data_type as data_type_cfg
+import oneflow.framework.distribute_context as distribute_ctx
+import oneflow.framework.hob as hob
+import oneflow.framework.placement_context as placement_ctx
+import oneflow.framework.runtime_mode as rt_mode
+import oneflow.framework.session_context as session_ctx
+import oneflow.framework.typing_util as oft_util
+import oneflow.support.enable_if as enable_if
+import oneflow.support.pb_util as pb_util
+from oneflow import oneflow_deprecate
+from oneflow.framework.function_desc import FunctionDesc
+
+
+class FunctionConfig(object):
+    """OneFlow function's configurations.
+    """
+
+    def __init__(self) -> None:
+        self.function_desc = FunctionDesc()
+
+    def __getattr__(
+        self, attr_name: str
+    ) -> Callable[[Optional[Union[bool, int, float, str]]], None]:
+        name2default = session_ctx.GetDefaultSession().function_flag_name2default_val
+        assert attr_name in name2default
+        flag_name2flag_value = (
+            self.function_desc.job_config_proto.mutable_flag_name2flag_value()
+        )
+        default_val = name2default[attr_name]
+
+        def FunctionConfigSetter(
+            attr_value: Optional[Union[bool, int, float, str]] = None
+        ) -> None:
+            if default_val.HasField("at_bool"):
+                if attr_value is None:
+                    attr_value = True
+                assert type(attr_value) is bool
+                flag_name2flag_value[attr_name].set_at_bool(attr_value)
+            elif default_val.HasField("at_int64"):
+                assert type(attr_value) is int
+                flag_name2flag_value[attr_name].set_at_int64(attr_value)
+            elif default_val.HasField("at_double"):
+                assert type(attr_value) is float
+                flag_name2flag_value[attr_name].set_at_double(attr_value)
+            elif default_val.HasField("at_string"):
+                assert type(attr_value) is str
+                flag_name2flag_value[attr_name].set_at_string(attr_value)
+            else:
+                raise NotImplementedError(
+                    "config_flag `%s' with type %s is not supported"
+                    % (attr_name, type(attr_value))
+                )
+
+        return FunctionConfigSetter
+
+
+def api_oneflow_function(
+    type: str = "predict", function_config: FunctionConfig = None
+) -> Callable[[Callable], Callable]:
+    """Creates a callable OneFlow global function from a Python function.
+
+    For instance::
+
+        @oneflow.global_function(flow.FunctionConfig())
+        def train():
+            # your model
+
+    Args:
+        function_config (FunctionConfig, optional): a `FunctionConfig` object. Defaults to FunctionConfig().
+
+    Returns:
+        Callable[[Callable], Callable]: a callable which is called to execute the compiled function
+    """
+    if isinstance(type, FunctionConfig):
+        function_config = type
+        print(
+            "WARNING: flow.global_function(func_config) is deprecated. Please replace it with flow.global_function(type, func_config).\n            "
+        )
+        print(traceback.format_stack()[-2])
+    else:
+        assert type in ["train", "predict"]
+        if function_config is None:
+            function_config = FunctionConfig()
+        if type == "train":
+            function_config.function_desc.job_config_proto.mutable_train_conf()
+        else:
+            function_config.function_desc.job_config_proto.mutable_predict_conf()
+    api = enable_if.unique([eager_oneflow_function, lazy_oneflow_function])
+    return api(function_config)
+
+
+@enable_if.condition(hob.in_normal_mode & hob.eager_execution_enabled)
+def eager_oneflow_function(function_config=FunctionConfig()):
+    assert isinstance(function_config, FunctionConfig)
+
+    def Decorator(job_func):
+        if not hasattr(job_func, "__oneflow_function_signature__"):
+            job_func.__oneflow_function_signature__ = inspect.signature(job_func)
+        oft_util.CheckGlobalFunctionAnnotation(job_func.__oneflow_function_signature__)
+        sess = session_ctx.GetDefaultSession()
+        function_desc = _CloneFunctionDesc(function_config.function_desc, job_func)
+
+        @functools.wraps(job_func)
+        def Func(*args, **kwargs):
+            return _RunEagerJob(sess, function_desc, *args, **kwargs)
+
+        for x in dir(job_func):
+            if x.startswith("__oneflow_"):
+                setattr(Func, x, getattr(job_func, x))
+        return Func
+
+    return Decorator
+
+
+@enable_if.condition(
+    hob.in_normal_mode & ~hob.eager_execution_enabled & ~hob.session_initialized
+)
+def lazy_oneflow_function(function_config=FunctionConfig()):
+    assert isinstance(function_config, FunctionConfig)
+
+    def Decorator(job_func):
+        if not hasattr(job_func, "__oneflow_function_signature__"):
+            job_func.__oneflow_function_signature__ = inspect.signature(job_func)
+        oft_util.CheckGlobalFunctionAnnotation(job_func.__oneflow_function_signature__)
+        sess = session_ctx.GetDefaultSession()
+
+        @functools.wraps(job_func)
+        def Func(*args, **kwargs):
+            return _RunLazyJob(sess, job_func, *args, **kwargs)
+
+        sess.AddJob(_CloneFunctionDesc(function_config.function_desc, job_func))
+        for x in dir(job_func):
+            if x.startswith("__oneflow_"):
+                setattr(Func, x, getattr(job_func, x))
+        return Func
+
+    return Decorator
+
+
+def global_function_or_identity(*args, **kwargs):
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        return api_oneflow_function(*args, **kwargs)
+    else:
+        assert rt_mode.CurrentMode() == rt_mode.GLOBAL_MODE
+        identity_decorator = lambda func: func
+        return identity_decorator
+
+
+def _CloneFunctionDesc(func_desc, job_func):
+    new_func_desc = FunctionDesc(job_func=job_func)
+    new_func_desc.job_config_proto.CopyFrom(func_desc.job_config_proto)
+    new_func_desc.function_attribute = copy.deepcopy(func_desc.function_attribute)
+    return new_func_desc
+
+
+def oneflow_function_config(*field_paths):
+    def Decorator(func):
+        global _class_property2return_obj_class
+        for field_path in field_paths:
+            fields = field_path.split(".")
+            assert len(fields) > 0
+            cls = FunctionConfig
+            for (index, field) in enumerate(fields):
+                assert field != "function_desc"
+                assert re.match("^[_\\w]+[_\\w\\d]*$", field)
+                if (cls, field) not in _class_property2return_obj_class:
+                    class_name = ".".join(["function_config"] + fields[: index + 1])
+
+                    def Init(self, function_desc):
+                        self.function_desc = function_desc
+
+                    config_class = type(class_name, (object,), dict(__init__=Init))
+                    setattr(cls, field, _MakeInnerJobConfigClassProperty(config_class))
+                    _class_property2return_obj_class[cls, field] = config_class
+                cls = _class_property2return_obj_class[cls, field]
+            cls.__call__ = _MakeLeafJobConfigCall(func)
+        return func
+
+    return Decorator
+
+
+_class_property2return_obj_class = {}
+
+
+def _MakeInnerJobConfigClassProperty(return_obj_class):
+    return property(lambda self: return_obj_class(self.function_desc))
+
+
+def _MakeLeafJobConfigCall(method):
+    return lambda self, *argv, **kwarg: method(self.function_desc, *argv, **kwarg)
+
+
+def _RunEagerJob(session, function_desc, *args):
+    return session.TryInit().EagerRun(function_desc, *args)
+
+
+def _RunLazyJob(session, job_func, *args, **kwargs):
+    return session.TryInit().LazyRun(job_func, *args, **kwargs)
+
+
+@oneflow_function_config("default_data_type")
+def set_default_data_type(func_desc, value):
+    """Set default data type for job
+
+    Args:
+        func_desc ([type]): job function
+        value ([type]): data type. e.g. flow.float
+    """
+    func_desc.job_config_proto.set_default_data_type(
+        data_type_cfg.DataType(
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(value)
+        )
+    )
+
+
+@oneflow_function_config("default_initializer_conf")
+def set_default_initializer_conf(func_desc, value):
+    """Set default initial configuration for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert type(value) is dict
+    pb_util.PythonDict2CFG(
+        value, func_desc.job_config_proto.mutable_default_initializer_conf()
+    )
+
+
+@oneflow_function_config("exp_run_conf")
+def set_exp_run_conf(value):
+    """Set experimental configuration for job
+
+    Args:
+        value ([type]): [description]
+    """
+    assert type(func_desc, value) is dict
+    pb_util.PythonDict2CFG(value, func_desc.job_config_proto.mutable_exp_run_conf())
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.has")
+def static_mem_alloc_policy_white_list_has_policy(func_desc, policy):
+    """Get items from white list related to static memory allocation policy
+
+    Args:
+        func_desc ([type]): [description]
+        policy ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return getattr(
+        func_desc.job_config_proto.mutable_memory_allocation_algorithm_conf(), policy
+    )()
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.add")
+def static_mem_alloc_policy_white_list_add_policy(func_desc, policy):
+    """Add item to white list related to static memory allocation policy
+
+    Args:
+        func_desc ([type]): [description]
+        policy ([type]): [description]
+    """
+    getattr(
+        func_desc.job_config_proto.mutable_memory_allocation_algorithm_conf(),
+        "set_" + policy,
+    )(True)
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.remove")
+def static_mem_alloc_policy_white_list_remove_policy(func_desc, policy):
+    """Remove item of white list related to static memory allocation policy
+
+    Args:
+        func_desc ([type]): [description]
+        policy ([type]): [description]
+    """
+    getattr(
+        func_desc.job_config_proto.mutable_memory_allocation_algorithm_conf(),
+        "set_" + policy,
+    )(False)
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.policy_mem_size_first")
+def policy_mem_size_first(func_desc):
+    """A static memory allocation policy called: mem_size_first
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return "use_mem_size_first_algo"
+
+
+@oneflow_function_config(
+    "static_mem_alloc_policy_white_list.policy_mutual_exclusion_first"
+)
+def policy_mutual_exclusion_first(func_desc):
+    """A static memory allocation policy called: mutual_exclusion_first
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return "use_mutual_exclusion_first_algo"
+
+
+@oneflow_function_config("static_mem_alloc_policy_white_list.policy_time_line")
+def policy_time_line(func_desc):
+    """A static memory allocation policy called: time_line
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return "use_time_line_algo"
+
+
+@oneflow_function_config("static_mem_alloc_algo_white_list.show")
+def show_static_mem_alloc_algo_white_list(func_desc):
+    """Show configuration of  static memory allocation policy,
+          including: "use_mem_size_first_algo", "use_mutual_exclusion_first_algo", "use_time_line_algo"
+
+    Args:
+        func_desc ([type]): [description]
+
+    Returns:
+        [type]: [description]
+    """
+    return [
+        "use_mem_size_first_algo",
+        "use_mutual_exclusion_first_algo",
+        "use_time_line_algo",
+    ]
+
+
+@oneflow_function_config("enable_cudnn")
+def set_enable_cudnn(func_desc, value=True):
+    """Whether use cudnn to accelerate job or not.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_cudnn(value)
+
+
+@oneflow_function_config("cudnn_buf_limit_mbyte")
+def set_cudnn_buf_limit_mbyte(func_desc, value):
+    """Set cudnn buffer limit, e.g. 1024mb
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_buf_limit_mbyte(value)
+
+
+@oneflow_function_config("cudnn_conv_force_fwd_algo")
+def set_cudnn_conv_force_fwd_algo(func_desc, value):
+    """Set value to cudnn conv_force_forward algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_force_fwd_algo(value)
+
+
+@oneflow_function_config("cudnn_conv_force_bwd_data_algo")
+def set_cudnn_conv_force_bwd_data_algo(func_desc, value):
+    """Set value to cudnn conv_force_backward_data algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_force_bwd_data_algo(value)
+
+
+@oneflow_function_config("cudnn_conv_force_bwd_filter_algo")
+def set_cudnn_conv_force_bwd_filter_algo(func_desc, value):
+    """Set value to cudnn conv_force_backward_filter algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_force_bwd_filter_algo(value)
+
+
+@oneflow_function_config("cudnn_conv_heuristic_search_algo")
+def set_cudnn_conv_heuristic_search_algo(func_desc, value):
+    """Set value to cudnn conv_heuristic_search algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_heuristic_search_algo(value)
+
+
+@oneflow_function_config("enable_cudnn_fused_normalization_add_relu")
+def set_enable_cudnn_fused_normalization_add_relu(func_desc, value):
+    """Whether enable cudnn_fused_normalization_add_relu.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_cudnn_fused_normalization_add_relu(value)
+
+
+@oneflow_function_config("enable_fuse_add_to_output")
+def set_enable_fuse_add_to_output(func_desc, value):
+    """Whether enable fuse_add_to_output.
+            If enabled, try to fuse a binary element-wise add to one of the predecessors to improve performance.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_fuse_add_to_output(value)
+
+
+@oneflow_function_config("enable_fuse_cast_scale")
+def set_enable_fuse_cast_scale(func_desc, value=True):
+    """Whether enable fuse_cast_scale.
+            If enabled, try to fuse cast and scalar_mul_by_tensor to improve performance.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_fuse_cast_scale(value)
+
+
+@oneflow_function_config("cudnn_conv_use_deterministic_algo_only")
+def set_cudnn_conv_use_deterministic_algo_only(func_desc, value):
+    """Set value to cudnn conv_use_deterministic_only algorithm
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_use_deterministic_algo_only(value)
+
+
+@oneflow_function_config("enable_reused_mem")
+def set_enable_reused_mem(func_desc, value=True):
+    """Whether enable reuse memory or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_reused_mem(value)
+
+
+@oneflow_function_config("enable_inplace")
+def set_enable_inplace(func_desc, value=True):
+    """Whether enable inplace  or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_inplace(value)
+
+
+@oneflow_function_config("enable_inplace_in_reduce_struct")
+def set_enable_inplace_in_reduce_struct(func_desc, value=True):
+    print(
+        "'enable_inplace_in_reduce_struct' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("enable_nccl")
+def set_enable_nccl(func_desc, value=True):
+    print(
+        "'enable_nccl' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("use_nccl_inter_node_communication")
+def set_use_nccl_inter_node_communication(func_desc, value=True):
+    print(
+        "'use_nccl_inter_node_communication' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("use_boxing_v2")
+def set_use_boxing_v2(func_desc, value=True):
+    print(
+        "'use_boxing_v2' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("do_parallel_cast_before_widening_type_cast")
+def set_do_parallel_cast_before_widening_type_cast(func_desc, value=True):
+    func_desc.job_config_proto.set_do_parallel_cast_before_widening_type_cast(value)
+
+
+@oneflow_function_config("enable_all_reduce_group")
+def set_enable_all_reduce_group(func_desc, value=True):
+    print(
+        "'enable_all_reduce_group' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_group_num")
+def set_all_reduce_group_num(func_desc, value):
+    print(
+        "'all_reduce_group_num' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_lazy_ratio")
+def set_all_reduce_lazy_ratio(func_desc, value):
+    print(
+        "'all_reduce_lazy_ratio' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_group_min_mbyte")
+def set_all_reduce_group_min_mbyte(func_desc, value):
+    print(
+        "'all_reduce_group_min_mbyte' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_group_size_warmup")
+def set_all_reduce_group_size_warmup(func_desc, value):
+    print(
+        "'all_reduce_group_size_warmup' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("all_reduce_fp16")
+def set_all_reduce_fp16(func_desc, value=True):
+    print(
+        "'all_reduce_fp16' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config(
+    "optimizer_placement_optimization_mode",
+    "train.optimizer_placement_optimization_mode",
+)
+def set_optimizer_placement_optimization_mode(func_desc, mode):
+    """Enable optimizer_placement_optimization with mode 'mode'
+
+    Args:
+        func_desc ([type]): [description]
+        mode (str): [description].
+    """
+    assert mode in ["non_distributed", "distributed_split"]
+    func_desc.job_config_proto.set_optimizer_placement_optimization_mode(mode)
+
+
+@oneflow_function_config(
+    "optimizer_placement_optimization_threshold",
+    "train.optimizer_placement_optimization_threshold",
+)
+def set_optimizer_placement_optimization_threshold(func_desc, value):
+    func_desc.job_config_proto.set_optimizer_placement_optimization_threshold(value)
+
+
+@oneflow_function_config("enable_non_distributed_optimizer")
+def set_enable_non_distributed_optimizer(func_desc, value=True):
+    """Whether enable non_distributed optimizer or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    if value:
+        set_optimizer_placement_optimization_mode(func_desc, "non_distributed")
+
+
+@oneflow_function_config("disable_all_reduce_sequence")
+def set_disable_all_reduce_sequence(func_desc, value=True):
+    print(
+        "'disable_all_reduce_sequence' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config("prune_parallel_cast_ops")
+def set_prune_parallel_cast_ops(func_desc, value=True):
+    """Whether prune parallel cast  operations or not.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_prune_parallel_cast_ops(value)
+
+
+@oneflow_function_config("prune_cast_to_static_shape_ops")
+def set_prune_cast_to_static_shape_ops(func_desc, value=True):
+    """Whether or not set prune_cast to static shape opretions
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_prune_cast_to_static_shape_ops(value)
+
+
+@oneflow_function_config("prune_amp_white_identity_ops")
+def set_prune_amp_white_identity_ops(func_desc, value=True):
+    """Whether prune amp_white_identity operations or not.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_prune_amp_white_identity_ops(value)
+
+
+@oneflow_function_config("non_distributed_optimizer_group_size_mbyte")
+def set_non_distributed_optimizer_group_size_mbyte(func_desc, value):
+    print(
+        "'non_distributed_optimizer_group_size_mbyte' has been deprecated, has no effect and will be removed in the future."
+    )
+
+
+@oneflow_function_config(
+    "enable_true_half_config_when_conv", "cudnn_conv_enable_true_half"
+)
+def set_cudnn_conv_enable_true_half(func_desc, value=True):
+    """Whether  use true_half mode or not during  convolution calculation process while using cudnn.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_cudnn_conv_enable_pseudo_half(not value)
+
+
+@oneflow_function_config(
+    "cudnn_conv_enable_pseudo_half", "enable_cudnn_conv_pseudo_half"
+)
+def set_cudnn_conv_enable_pseudo_half(func_desc, value):
+    """Whether  enable pseudo_half mode or not during  convolution calculation process while using cudnn
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_cudnn_conv_enable_pseudo_half(value)
+
+
+@oneflow_function_config("enable_float_compute_for_half_gemm")
+def set_enable_float_compute_for_half_gemm(func_desc, value=True):
+    """Whether  enable float_compute or not ,
+          if True, means that the type of intermedia value is float when compute half gemm.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    print(
+        "WARNING: enable_float_compute_for_half_gemm has been deprecated, because we always use float compute for half gemm. Please remove it.\n        "
+    )
+    print(traceback.format_stack()[-3])
+
+
+@oneflow_function_config("enable_quantization_aware_training")
+@oneflow_function_config("enable_qat")
+def set_enable_quantization_aware_training(func_desc, value=True):
+    """If true, then job will use quantization aware training
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_quantization_aware_training(value)
+
+
+@oneflow_function_config("qat.per_channel_weight_quantization")
+def set_qat_per_channel(func_desc, value=True):
+    func_desc.job_config_proto.mutable_qat_config().set_per_channel_weight_quantization(
+        value
+    )
+
+
+@oneflow_function_config("qat.symmetric")
+def set_qat_symmetric(func_desc, value=True):
+    func_desc.job_config_proto.mutable_qat_config().set_symmetric(value)
+
+
+@oneflow_function_config("qat.moving_min_max_momentum")
+def set_qat_moving_min_max_momentum(func_desc, value: float):
+    func_desc.job_config_proto.mutable_qat_config().set_moving_min_max_momentum(value)
+
+
+@oneflow_function_config("qat.moving_min_max_stop_update_after_iters")
+def set_qat_moving_min_max_momentum(func_desc, value: float):
+    func_desc.job_config_proto.mutable_qat_config().set_moving_min_max_stop_update_after_iters(
+        value
+    )
+
+
+@oneflow_function_config("qat.target_backend")
+def set_qat_symmetric(func_desc, value: str):
+    func_desc.job_config_proto.mutable_qat_config().set_target_backend(value)
+
+
+@oneflow_function_config("enable_auto_mixed_precision")
+def set_enable_auto_mixed_precision(func_desc, value=True):
+    """If true, then job will use mixed precision mode, it means use both float16 and float32 during model training.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.set_enable_auto_mixed_precision(value)
+
+
+@oneflow_function_config("enable_keep_header_only")
+def set_enable_keep_header_only(func_desc, value=True):
+    """deprecated api.
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    print("Sorry! enable_keep_header_only is deprecated and it doesn't work.\n")
+
+
+@oneflow_function_config("concurrency_width")
+def set_concurrency_width(func_desc, value):
+    """Set up concurrency width
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_concurrency_width(value)
+
+
+@oneflow_function_config("train.model_update_conf")
+def set_model_update_conf(func_desc, value):
+    """Set up optimizer and update method of learning rate  for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    assert type(value) is dict
+    pb_msg = func_desc.job_config_proto.mutable_train_conf().mutable_model_update_conf()
+    pb_util.PythonDict2CFG(value, pb_msg)
+
+
+@oneflow_function_config("indexed_slices_optimizer_conf")
+def set_indexed_slices_optimizer_conf(func_desc, value):
+    """Set indexed slices configuration of optimizer
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert type(value) is dict
+    pb_msg = func_desc.job_config_proto.mutable_indexed_slices_optimizer_conf()
+    pb_util.PythonDict2CFG(value, pb_msg)
+
+
+@oneflow_function_config("enable_fuse_model_update_ops")
+def set_enable_fuse_model_update_ops(func_desc, value=True):
+    """Whether enable fuse_model_update_ops.
+            If enabled, try to fuse cast + scale + l1_l2_regularize_gradient + model_update to one op to improve performance.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_fuse_model_update_ops(value)
+
+
+@oneflow_function_config("enable_gradients_stats_aggregation")
+def set_enable_gradients_stats_aggregation(func_desc, value=True):
+    """Whether enable gradients_stats_aggregation.
+            If enabled, gradients stats ops (norm, finite, ...) will be aggregated.
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.job_config_proto.set_enable_gradients_stats_aggregation(value)
+
+
+@oneflow_function_config("train.loss_scale_factor")
+def set_loss_scale_factor(func_desc, value):
+    """Set scale factor for loss
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    func_desc.job_config_proto.mutable_train_conf().set_loss_scale_factor(value)
+
+
+@oneflow_function_config("train.primary_lr")
+def set_primary_lr(func_desc, value):
+    """Set the primary leaning rate for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    func_desc.job_config_proto.mutable_train_conf().set_primary_lr(value)
+
+
+@oneflow_function_config("train.secondary_lr")
+def set_secondary_lr(func_desc, value):
+    """Set the secondary leaning rate for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    print(
+        "WARNING: func_config.train.* has been deprecated. Please replace it by the new optimizer api.\n        "
+    )
+    print(traceback.format_stack()[-3])
+    func_desc.job_config_proto.mutable_train_conf().set_secondary_lr(value)
+
+
+@oneflow_function_config("train.num_gradient_accumulation_steps")
+def set_num_gradient_accumulation_steps(func_desc, value):
+    func_desc.job_config_proto.set_num_gradient_accumulation_steps(value)
+
+
+@oneflow_function_config("default_placement_scope")
+def set_default_placement(func_desc, value):
+    """Set the default placement for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert isinstance(value, placement_ctx.EmptyPlacementScope)
+    func_desc.function_attribute.default_placement_scope = value
+
+
+@oneflow_function_config("use_xla_jit")
+def set_use_xla_jit(func_desc, value=True):
+    """Whether use xla  or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.mutable_xrt_config().set_use_xla_jit(value)
+
+
+@oneflow_function_config("use_tensorrt")
+def set_use_tensorrt(func_desc, value=True):
+    """Whether use tensorrt or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    func_desc.job_config_proto.mutable_xrt_config().set_use_tensorrt(value)
+
+
+@oneflow_function_config("tensorrt.use_fp16")
+def set_tensorrt_use_fp16(func_desc, value=True):
+    """Whether use tensorrt fp16  or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    set_use_tensorrt(func_desc, True)
+    func_desc.job_config_proto.mutable_xrt_config().mutable_tensorrt_config().set_use_fp16(
+        value
+    )
+
+
+@oneflow_function_config("tensorrt.use_int8")
+def set_tensorrt_use_int8(func_desc, value=True):
+    """Whether use tensorrt int8 mode or not
+
+    Args:
+        func_desc ([type]): [description]
+        value (bool, optional): [description]. Defaults to True.
+    """
+    set_use_tensorrt(func_desc, True)
+    func_desc.job_config_proto.mutable_xrt_config().mutable_tensorrt_config().set_use_int8(
+        value
+    )
+
+
+@oneflow_function_config("tensorrt.int8_calibration")
+def set_tensorrt_int8_calibration(func_desc, value):
+    """Set up calibration of tensorrt int8
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert func_desc.job_config_proto.xrt_config().tensorrt_config().use_int8()
+    func_desc.job_config_proto.mutable_xrt_config().mutable_tensorrt_config().set_int8_calibration(
+        value
+    )
+
+
+@oneflow_function_config("default_logical_view")
+def set_default_distribute_strategy(func_desc, value):
+    """Set up default distribute strategy for job
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    assert isinstance(value, distribute_ctx.DistributeStrategy)
+    func_desc.function_attribute.default_distribute_strategy = value
+
+
+@oneflow_function_config("allow_cpu_return_op")
+def allow_cpu_return_op(func_desc, value):
+    """Whether allow operaions returned from cpu or  not
+
+    Args:
+        func_desc ([type]): [description]
+        value ([type]): [description]
+    """
+    func_desc.function_attribute.allow_cpu_return_op = value
+
+
+@oneflow_function_config("default_distribute_strategy")
+@oneflow_deprecate()
+def deprecated_set_default_distribute_strategy(*args, **kwargs):
+    print(
+        "WARNING:",
+        "function_config.default_distribute_strategy",
+        "has been deprecated. Please use {} instead.".format(
+            "function_config.default_logical_view"
+        ),
+    )
+    print(traceback.format_stack()[-3], file=sys.stderr)
+    set_default_distribute_strategy(*args, **kwargs)
diff --git a/python/oneflow/framework/functional.py b/python/oneflow/framework/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c8b8164fd8dbcb3c3a8eaac5045dc081f3ff200
--- /dev/null
+++ b/python/oneflow/framework/functional.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+import oneflow._oneflow_internal
+
+
+def RecursveDetermine(arg):
+    if isinstance(arg, flow.Tensor):
+        if not arg.is_determined:
+            arg.determine()
+        return arg._local_or_consistent_tensor
+    elif isinstance(arg, list) or isinstance(arg, tuple):
+        arg = list(arg)
+        for i in range(len(arg)):
+            arg[i] = RecursveDetermine(arg[i])
+        return arg
+    elif isinstance(arg, dict):
+        for (k, v) in arg.items():
+            arg[k] = RecursveDetermine(v)
+    else:
+        return arg
+
+
+class Function:
+    def __init__(self, func_name, handle):
+        self.func_name = func_name
+        self.handle = handle
+
+    def __call__(self, *args, **kwargs):
+        args = list(args)
+        for i in range(len(args)):
+            args[i] = RecursveDetermine(args[i])
+        for (k, v) in kwargs.items():
+            kwargs[k] = RecursveDetermine(v)
+        return self.handle(*args, **kwargs)
+
+
+def RegisterFunctionalApis():
+    import inspect
+
+    import oneflow.F
+
+    for s in dir(oneflow._oneflow_internal.F):
+        f = getattr(oneflow._oneflow_internal.F, s)
+        if inspect.isbuiltin(f):
+            func_name = s
+            if s in _function_name_aliases:
+                func_name = _function_name_aliases[s]
+                setattr(oneflow.F, func_name, Function(func_name, f))
+            setattr(oneflow.F, s, Function(func_name, f))
+    del inspect
+
+
+_function_name_aliases = {"add_scalar": "scalar_add"}
diff --git a/python/oneflow/framework/generator.py b/python/oneflow/framework/generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4e980eb0e88412aa817e28eea04582b2c2ec761
--- /dev/null
+++ b/python/oneflow/framework/generator.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+import oneflow._oneflow_internal
+
+
+def create_generator(device=None):
+    if device is None:
+        device = "auto"
+    return oneflow._oneflow_internal.create_generator(device)
+
+
+def default_generator(device=None):
+    if device is None:
+        device = "auto"
+    return oneflow._oneflow_internal.default_generator(device)
+
+
+def manual_seed(seed):
+    oneflow._oneflow_internal.manual_seed(seed)
diff --git a/python/oneflow/framework/graph_build_util.py b/python/oneflow/framework/graph_build_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2e401eab9d9c129dca7dd868eb090243b7049d0
--- /dev/null
+++ b/python/oneflow/framework/graph_build_util.py
@@ -0,0 +1,162 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+import oneflow.core.job.scope_pb2 as scope_pb2_util
+import oneflow.framework.attr_util as attr_util
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.placement_util as placement_util
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_context
+from oneflow._oneflow_internal import Tensor as InternalTensor
+from oneflow.framework.tensor import Tensor
+
+lazy_mode = oneflow._oneflow_internal.lazy_mode
+
+
+@contextmanager
+def graph_build_context(config_proto, session):
+    prev_scope = oneflow._oneflow_internal.GetCurrentScope()
+    new_scope = scope_util.MakeInitialScope(config_proto, "cpu", ["0:0"], None, False)
+    with lazy_mode.gard(True):
+        with JobBuildAndInferCtx(config_proto):
+            with BlockScopeContext(prev_scope, new_scope):
+                yield
+
+
+class JobBuildAndInferCtx(object):
+    def __init__(self, config_proto):
+        self._job_conf = config_proto
+
+    def __enter__(self):
+        c_api_util.JobBuildAndInferCtx_Open(self._job_conf.job_name())
+        c_api_util.CurJobBuildAndInferCtx_SetJobConf(self._job_conf)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
+        oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+        if exc_type is None:
+            return True
+        else:
+            return False
+
+
+class BlockScopeContext(object):
+    def __init__(self, prev_scope, new_scope):
+        assert prev_scope is not None
+        assert new_scope is not None
+        self._prev_scope = prev_scope
+        self._new_scope = new_scope
+
+    def __enter__(self):
+        oneflow._oneflow_internal.GlobalScopeStackPush(self._new_scope)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        assert oneflow._oneflow_internal.GetCurrentScope() is self._new_scope
+        oneflow._oneflow_internal.GlobalScopeStackPop()
+        assert oneflow._oneflow_internal.GetCurrentScope() is self._prev_scope
+        if exc_type is None:
+            return True
+        else:
+            return False
+
+
+def make_new_block_scope(prev_scope, block):
+    assert prev_scope is not None
+    assert block is not None
+    attr_dict = dict()
+    if block.config.stage_id is not None:
+        attr_dict["pipeline_stage_id_hint"] = block.config.stage_id
+    if block.config.activation_checkpointing is not None:
+        attr_dict["checkpointing"] = block.config.activation_checkpointing
+    name2default = session_context.GetDefaultSession().scope_attr_name2default_val
+
+    def scope_proto_setter(scope_proto):
+        for (attr_name, py_value) in attr_dict.items():
+            assert attr_name in name2default
+            attr_util.SetAttrValue(
+                scope_proto.mutable_attr_name2attr_value()[attr_name],
+                py_value,
+                name2default[attr_name],
+            )
+        scope_proto.clear_scope_op_name_prefixes()
+        scope_proto.add_scope_op_name_prefixes(block.name_prefix + block.name)
+
+    new_scope = None
+
+    def build_scope(builder):
+        nonlocal new_scope
+        new_scope = builder.BuildScopeByProtoSetter(prev_scope, scope_proto_setter)
+        assert new_scope is not None
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(build_scope)
+    return new_scope
+
+
+def scope_to_proto(scope):
+    return text_format.Parse(scope._proto_str, scope_pb2_util.ScopeProto())
+
+
+def build_graph_input_arg(op_name, arg):
+    assert isinstance(arg, (Tensor, InternalTensor))
+    input_conf = (
+        oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedInputOpConf()
+    )
+    input_op = oneflow._oneflow_internal.one.FeedInputOpExpr(
+        op_name, input_conf, ["in_0"], ["out_0"]
+    )
+    attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+    if isinstance(arg, Tensor):
+        if not arg.is_determined:
+            arg.determine()
+        tensor_in_c = arg._local_or_consistent_tensor
+    else:
+        tensor_in_c = arg
+    lazy_arg = input_op.apply([tensor_in_c], attrs)[0]
+    return lazy_arg
+
+
+def build_graph_state(op_name, state_tensor):
+    var_conf = (
+        oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedVariableOpConf()
+    )
+    var_op = oneflow._oneflow_internal.one.FeedVariableOpExpr(
+        op_name, var_conf, ["in_0"], ["out_0"]
+    )
+    attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+    assert isinstance(state_tensor, Tensor)
+    if not state_tensor.is_determined:
+        state_tensor.determine()
+    tensor_in_c = state_tensor._local_or_consistent_tensor
+    lazy_tensor = var_op.apply([tensor_in_c], attrs)[0]
+    return lazy_tensor
+
+
+def build_graph_output(op_name, out):
+    assert isinstance(out, InternalTensor)
+    assert out.is_lazy
+    output_conf = (
+        oneflow._oneflow_internal.oneflow.core.operator.op_conf.FetchOutputOpConf()
+    )
+    output_op = oneflow._oneflow_internal.one.FetchOutputOpExpr(
+        op_name, output_conf, ["in_0"], ["out_0"]
+    )
+    attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+    eager_out = output_op.apply([out], attrs)[0]
+    return eager_out
diff --git a/python/oneflow/framework/hob.py b/python/oneflow/framework/hob.py
new file mode 100644
index 0000000000000000000000000000000000000000..f374ba003156efed0ea05a6618dfa64578fd45e2
--- /dev/null
+++ b/python/oneflow/framework/hob.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.framework.runtime_mode as rt_mode
+import oneflow.framework.session_context as session_ctx
+from oneflow.support.high_order_bool import bool_functor
+
+
+@bool_functor("Current mode is %s" % rt_mode.NORMAL_MODE)
+def in_normal_mode(ctx):
+    return rt_mode.CurrentMode() == rt_mode.NORMAL_MODE
+
+
+@bool_functor("Current mode is %s" % rt_mode.GLOBAL_MODE)
+def in_global_mode(ctx):
+    return rt_mode.CurrentMode() == rt_mode.GLOBAL_MODE
+
+
+@bool_functor("Current mode is %s" % rt_mode.DEVICE_MODE)
+def in_device_mode(ctx):
+    return rt_mode.CurrentMode() == rt_mode.DEVICE_MODE
+
+
+@bool_functor("Environment initialized")
+def env_initialized(ctx):
+    assert in_normal_mode(ctx)
+    return oneflow._oneflow_internal.IsEnvInited()
+
+
+@bool_functor("Any global function defined")
+def any_global_function_defined(ctx):
+    assert in_normal_mode(ctx)
+    return session_ctx.GetDefaultSession().AnyGlobalFunctionDefined()
+
+
+@bool_functor("Eager execution enabled")
+def eager_execution_enabled(ctx):
+    return oneflow._oneflow_internal.EagerExecutionEnabled()
+
+
+@bool_functor("Session initialized")
+def session_initialized(ctx):
+    assert in_normal_mode(ctx)
+    return session_ctx.GetDefaultSession().is_running
+
+
+@bool_functor("Current global function is trainable")
+def is_trainable(ctx):
+    assert in_global_mode(ctx)
+    if oneflow._oneflow_internal.EagerExecutionEnabled():
+        return session_ctx.GetDefaultSession().CurrentEagerGlobalFunctionDesc()
+    else:
+        job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+        return session_ctx.GetDefaultSession().GetFunctionDesc(job_name)
+
+
+@bool_functor("Current machine is master")
+def is_current_machine_master(ctx):
+    return oneflow._oneflow_internal.CurrentMachineId() == 0
+
+
+@bool_functor("Consistent view enabled")
+def consistent_view_enabled(ctx):
+    return oneflow.scope.consistent_view_enabled()
+
+
+@bool_functor("Mirrored view enabled")
+def mirrored_view_enabled(ctx):
+    return oneflow.scope.mirrored_view_enabled()
diff --git a/python/oneflow/framework/id_util.py b/python/oneflow/framework/id_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee1bde48edda821bbe9974be794cc2249cd9d639
--- /dev/null
+++ b/python/oneflow/framework/id_util.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+
+
+def UniqueStr(prefix):
+    return oneflow._oneflow_internal.UniqueStr(prefix)
diff --git a/python/oneflow/framework/input_blob_def.py b/python/oneflow/framework/input_blob_def.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e235645af46a1cfc3128f5ebccf1d4dc142608
--- /dev/null
+++ b/python/oneflow/framework/input_blob_def.py
@@ -0,0 +1,282 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import sys
+import traceback
+from functools import reduce
+from typing import Any, Optional, Sequence, Union
+
+import numpy as np
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
+import oneflow.core.job.sbp_parallel_pb2 as sbp_parallel_pb
+import oneflow.core.operator.interface_blob_conf_pb2 as inter_face_blob_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.compile_context as compile_context
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.placement_context as placement_ctx
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+class ArgBlobDef(object):
+    def __init__(
+        self,
+        shape,
+        dtype,
+        name=None,
+        distribute=oneflow._oneflow_internal.distribute.auto(),
+    ):
+        lbi = lbi_util.LogicalBlobId()
+        if name is None:
+            name = id_util.UniqueStr("Input_")
+        lbi.set_op_name(name)
+        lbi.set_blob_name("out")
+        self.lbi_ = lbi
+        assert type(shape) is tuple
+        for dim in shape:
+            assert type(dim) is int
+            assert dim > 0
+        self.shape_ = shape
+        self.dtype_ = dtype
+        self.distribute_ = distribute
+
+    @property
+    def lbi(self):
+        return self.lbi_
+
+    @property
+    def op_name(self):
+        return self.lbi_.op_name()
+
+    @property
+    def blob_name(self):
+        return self.lbi_.blob_name()
+
+    @property
+    def unique_name(self):
+        return self.op_name + "/" + self.blob_name + self._Distribute2Str()
+
+    @property
+    def shape(self):
+        return self.shape_
+
+    @property
+    def dtype(self):
+        return self.dtype_
+
+    @property
+    def is_dynamic(self):
+        raise NotImplementedError
+
+    def with_distribute(self, distribute):
+        return type(self)(shape=self.shape_, dtype=self.dtype_, name=self.op_name)
+
+    def Clone(self, op_name=None):
+        return type(self)(shape=self.shape_, dtype=self.dtype_, name=op_name)
+
+    def AddAndInferOp(self, op_conf):
+        raise NotImplementedError
+
+    def EagerAddAndInferOp(self, op_conf):
+        raise NotImplementedError
+
+    def CheckAndAsyncPush(self, session, arg_ndarray):
+        self._CheckNdarray(arg_ndarray)
+        self._AsyncPush(session, arg_ndarray)
+
+    def _CheckNdarray(self, ndarray):
+        raise NotImplementedError
+
+    def _AsyncPush(self, session, arg_ndarray):
+        raise NotImplementedError
+
+    def ToInterfaceBlobConf(self):
+        interface_blob_conf = inter_face_blob_conf_util.InterfaceBlobConf()
+        interface_blob_conf.shape.dim.extend(self.shape_)
+        interface_blob_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+            self.dtype_
+        )
+        interface_blob_conf.is_dynamic = self.is_dynamic
+        sbp_parallel = sbp_parallel_pb.SbpParallel()
+        sbp_parallel.split_parallel.axis = 0
+        interface_blob_conf.parallel_distribution.sbp_parallel.extend([sbp_parallel])
+        return interface_blob_conf
+
+    def _Distribute2Str(self):
+        if (
+            type(self.distribute_)
+            is oneflow._oneflow_internal.distribute.AutoDistribute
+        ):
+            return ""
+        elif (
+            type(self.distribute_)
+            is oneflow._oneflow_internal.distribute.SplitDistribute
+        ):
+            return ":S" + str(self.distribute_.axis)
+        elif (
+            type(self.distribute_)
+            is oneflow._oneflow_internal.distribute.BroadcastDistribute
+        ):
+            return ":B"
+        else:
+            raise NotImplementedError
+
+
+class FixedTensorDef(ArgBlobDef):
+    def __init__(
+        self,
+        shape: Sequence[int],
+        dtype: oneflow.dtype = oneflow.float,
+        name: Optional[str] = None,
+    ) -> None:
+        ArgBlobDef.__init__(self, shape, dtype=dtype, name=name)
+
+    @property
+    def is_dynamic(self) -> bool:
+        return False
+
+    def AddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> Any:
+        return compile_context.CurJobAddConsistentOp(op_conf)
+
+    def EagerAddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> Any:
+        parallel_symbol = oneflow.current_scope().device_parallel_desc_symbol
+        if (
+            parallel_symbol.device_tag == "gpu"
+            and list(dict(parallel_symbol.machine_id2device_id_list).keys()) == [0]
+            and (parallel_symbol.parallel_num == 1)
+        ):
+            device_tag = "gpu"
+            device_ids = "@0:%s" % parallel_symbol.machine_id2device_id_list[0][0]
+        else:
+            device_tag = "cpu"
+            device_ids = "@0:0"
+        with oneflow.scope.placement(device_tag, device_ids):
+            return compile_context.CurJobAddConsistentOp(op_conf)
+
+    def _CheckNdarray(self, ndarray: np.ndarray) -> None:
+        assert isinstance(ndarray, np.ndarray)
+        assert ndarray.shape == self.shape
+
+    def _AsyncPush(self, session: object, arg_ndarray: np.ndarray) -> None:
+        session.AsyncPush(self.op_name, _MakePushNdarrayCallback(arg_ndarray))
+
+
+class MirroredTensorDef(ArgBlobDef):
+    def __init__(
+        self,
+        shape: Sequence[int],
+        dtype: oneflow.dtype = oneflow.float,
+        name: Optional[str] = None,
+    ) -> None:
+        assert type(shape) is tuple
+        ArgBlobDef.__init__(self, shape, dtype=dtype, name=name)
+        self.sub_consistent_blob_list_ = []
+
+    @property
+    def is_dynamic(self) -> bool:
+        return True
+
+    def AddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> None:
+        _AddAndInferMirroredOp(
+            self.unique_name, op_conf, self.sub_consistent_blob_list_
+        )
+
+    def EagerAddAndInferOp(self, op_conf: op_conf_util.OperatorConf) -> Any:
+        return compile_context.CurJobAddMirroredOp(op_conf)
+
+    def _CheckNdarray(self, ndarray_list: Sequence[np.ndarray]) -> None:
+        assert isinstance(ndarray_list, (list, tuple))
+        assert len(self.sub_consistent_blob_list_) == len(ndarray_list)
+
+        def GetElemCnt(shape):
+            return reduce(lambda x, y: x * y, shape, 1)
+
+        for (consistent_blob, ndarray) in zip(
+            self.sub_consistent_blob_list_, ndarray_list
+        ):
+            assert type(ndarray) is np.ndarray
+            assert len(ndarray.shape) == len(self.shape)
+            assert GetElemCnt(ndarray.shape) <= GetElemCnt(self.shape)
+
+    def _AsyncPush(self, session: object, ndarray_list: Sequence[np.ndarray]) -> None:
+        for i in range(len(ndarray_list)):
+            sub_blob = self.sub_consistent_blob_list_[i]
+            session.AsyncPush(
+                sub_blob.op_name, _MakePushNdarrayCallback(ndarray_list[i])
+            )
+
+
+def _AddAndInferMirroredOp(mirrored_lbn, op_conf, sub_consistent_blob_list):
+    compile_context.CurJobAddMirroredOp(op_conf)
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    num_sub_lbi = c_api_util.JobBuildAndInferCtx_MirroredBlobGetNumSubLbi(
+        job_name, mirrored_lbn
+    )
+    for i in range(num_sub_lbi):
+        sub_lbi = c_api_util.JobBuildAndInferCtx_MirroredBlobGetSubLbi(
+            job_name, mirrored_lbn, i
+        )
+        lbi = lbi_util.LogicalBlobId()
+        lbi.set_op_name(sub_lbi.op_name)
+        lbi.set_blob_name(sub_lbi.blob_name)
+        sub_consistent_blob_list.append(
+            oneflow._oneflow_internal.ConsistentBlob(
+                lbi, "", oneflow._oneflow_internal.distribute.auto()
+            )
+        )
+
+
+def _MakePushNdarrayCallback(ndarray):
+    copied = np.copy(ndarray, order="C")
+
+    def Copy(ofblob):
+        capacity = reduce(lambda x, y: x * y, ofblob.static_shape, 1)
+        elem_cnt = reduce(lambda x, y: x * y, copied.shape, 1)
+        assert elem_cnt <= capacity, "%s v.s. %s" % (copied.shape, ofblob.static_shape)
+        ofblob.CopyFromNdarray(copied)
+
+    return Copy
+
+
+class DeprecatedFixedTensorDef(FixedTensorDef):
+    def __init__(self, *args, **kwargs):
+        running_script = traceback.format_stack()[-2].split(",")[0].split(" ")[3]
+        if not running_script.endswith('input_blob_def.py"'):
+            print(
+                "WARNING: oneflow.FixedTensorDef has been deprecated. Please use oneflow.typing.Numpy.Placeholder instead."
+            )
+            print(
+                "For instance:\n            - def job_func(images=oneflow.FixedTensorDef((32, 1, 28, 28), dtype=flow.float))\n            + def job_func(images:oneflow.typing.Numpy.Placeholder((32, 1, 28, 28), dtype=flow.float))"
+            )
+            print(traceback.format_stack()[-2])
+        super().__init__(*args, **kwargs)
+
+
+class DeprecatedMirroredTensorDef(MirroredTensorDef):
+    def __init__(self, *args, **kwargs):
+        running_script = traceback.format_stack()[-2].split(",")[0].split(" ")[3]
+        if not running_script.endswith('input_blob_def.py"'):
+            print(
+                "WARNING: oneflow.MirroredTensorDef has been deprecated. Please use oneflow.typing.ListNumpy.Placeholder instead."
+            )
+            print(
+                "For instance:\n            - def job_func(images=oneflow.MirroredTensorDef((32, 1, 28, 28), dtype=flow.float))\n            + def job_func(images:oneflow.typing.ListNumpy.Placeholder((32, 1, 28, 28), dtype=flow.float))"
+            )
+            print(traceback.format_stack()[-2])
+        super().__init__(*args, **kwargs)
diff --git a/python/oneflow/framework/interpret_util.py b/python/oneflow/framework/interpret_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe5f9e3daf01f5a5a0c1cbb446a2e494b6b195d1
--- /dev/null
+++ b/python/oneflow/framework/interpret_util.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.eager.gradient_util as gradient_util
+import oneflow.framework.compile_context as compile_ctx
+import oneflow.framework.hob as hob
+import oneflow.support.enable_if as enable_if
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def Forward(op_conf, scope_symbol=None):
+    if scope_symbol is None:
+        scope_symbol = oneflow.current_scope()
+    func = enable_if.unique([LazyInfer, EagerForward])
+    return func(compile_ctx.CurJobAddOp, op_conf, scope_symbol)
+
+
+def OpKernelForward(op_conf, opkernel_object):
+    func = enable_if.unique([LazyOpKernelInfer, EagerOpKernelForward])
+    return func(compile_ctx.CurJobAddOp, op_conf, opkernel_object)
+
+
+def ConsistentForward(op_conf, scope_symbol=None):
+    if scope_symbol is None:
+        scope_symbol = oneflow.current_scope()
+    func = enable_if.unique([LazyInfer, EagerForward])
+    return func(compile_ctx.CurJobAddConsistentOp, op_conf, scope_symbol)
+
+
+def OpKernelConsistentForward(op_conf, opkernel_object):
+    func = enable_if.unique([LazyOpKernelInfer, EagerOpKernelForward])
+    return func(compile_ctx.CurJobAddConsistentOp, op_conf, opkernel_object)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyInfer(add_and_infer, op_conf, scope_symbol=None):
+    return add_and_infer(op_conf, scope_symbol)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyOpKernelInfer(add_and_infer, op_conf, opkernel_object):
+    return add_and_infer(op_conf, opkernel_object.scope_symbol)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerForward(add_and_infer, op_conf, scope_symbol=None):
+    op_attribute = add_and_infer(op_conf, scope_symbol)
+    parallel_conf = scope_symbol.device_parallel_desc_symbol.parallel_conf
+    import oneflow.eager.op_executor as op_executor
+
+    op_executor.Interpret(op_attribute, parallel_conf, blob_register)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    gradient_util.TrySetBackwardUsedBlobObject(
+        op_attribute, blob_register, bw_blob_register
+    )
+    return op_attribute
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerOpKernelForward(add_and_infer, op_conf, opkernel_object):
+    op_attribute = add_and_infer(op_conf, opkernel_object.scope_symbol)
+    import oneflow.eager.op_executor as op_executor
+
+    op_executor.OpKernelCall(opkernel_object, op_attribute, blob_register)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    gradient_util.TrySetBackwardUsedBlobObject(
+        op_attribute, blob_register, bw_blob_register
+    )
+    return op_attribute
diff --git a/python/oneflow/framework/job_instance.py b/python/oneflow/framework/job_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5da48d59c4901e4b063f852a5c299df41767e09
--- /dev/null
+++ b/python/oneflow/framework/job_instance.py
@@ -0,0 +1,145 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import sys
+import traceback
+
+import oneflow._oneflow_internal
+import oneflow.framework.ofblob as ofblob
+
+
+def MakeUserJobInstance(job_name, finish_cb=None):
+    return MakeJobInstance(job_name, finish_cb=finish_cb)
+
+
+def MakePullJobInstance(job_name, op_name, pull_cb, finish_cb=None):
+    return MakeJobInstance(
+        job_name,
+        sole_output_op_name_in_user_job=op_name,
+        pull_cb=pull_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def MakePushJobInstance(job_name, op_name, push_cb, finish_cb=None):
+    return MakeJobInstance(
+        job_name,
+        sole_input_op_name_in_user_job=op_name,
+        push_cb=push_cb,
+        finish_cb=finish_cb,
+    )
+
+
+def MakeArgPassJobInstance(job_name, src_op_name, dst_op_name, finish_cb=None):
+    return MakeJobInstance(
+        job_name,
+        sole_output_op_name_in_user_job=src_op_name,
+        sole_input_op_name_in_user_job=dst_op_name,
+        finish_cb=finish_cb,
+    )
+
+
+def MakeJobInstance(*arg, **kw):
+    def _DoNothing():
+        pass
+
+    if "finish_cb" not in kw or kw["finish_cb"] is None:
+        kw["finish_cb"] = _DoNothing
+    job_instance = JobInstance(*arg, **kw)
+    global _flying_job_instance
+    _flying_job_instance[id(job_instance)] = job_instance
+
+    def DereferenceJobInstance(job_instance):
+        global _flying_job_instance
+        del _flying_job_instance[id(job_instance)]
+
+    job_instance.AddPostFinishCallback(DereferenceJobInstance)
+    return job_instance
+
+
+class JobInstance(oneflow._oneflow_internal.JobInstance):
+    def __init__(
+        self,
+        job_name,
+        sole_input_op_name_in_user_job=None,
+        sole_output_op_name_in_user_job=None,
+        push_cb=None,
+        pull_cb=None,
+        finish_cb=None,
+    ):
+        oneflow._oneflow_internal.JobInstance.__init__(self)
+        self.thisown = 0
+        self.job_name_ = str(job_name)
+        self.sole_input_op_name_in_user_job_ = str(sole_input_op_name_in_user_job)
+        self.sole_output_op_name_in_user_job_ = str(sole_output_op_name_in_user_job)
+        self.push_cb_ = push_cb
+        self.pull_cb_ = pull_cb
+        self.finish_cb_ = finish_cb
+        self.post_finish_cbs_ = []
+
+    def job_name(self):
+        try:
+            return self.job_name_
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def sole_input_op_name_in_user_job(self):
+        try:
+            return self.sole_input_op_name_in_user_job_
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def sole_output_op_name_in_user_job(self):
+        try:
+            return self.sole_output_op_name_in_user_job_
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def PushBlob(self, of_blob_ptr):
+        try:
+            self.push_cb_(ofblob.OfBlob(of_blob_ptr))
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def PullBlob(self, of_blob_ptr):
+        try:
+            self.pull_cb_(ofblob.OfBlob(of_blob_ptr))
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def Finish(self):
+        try:
+            self.finish_cb_()
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+        finally:
+            try:
+                for post_finish_cb in self.post_finish_cbs_:
+                    post_finish_cb(self)
+            except Exception as e:
+                print(traceback.format_exc())
+                raise e
+
+    def AddPostFinishCallback(self, cb):
+        self.post_finish_cbs_.append(cb)
+
+
+_flying_job_instance = {}
diff --git a/python/oneflow/framework/job_set_util.py b/python/oneflow/framework/job_set_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c9970b4c1c817073f6787f32861a3d75677cd1d
--- /dev/null
+++ b/python/oneflow/framework/job_set_util.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, TypeVar
+
+from oneflow.core.job.job_set_pb2 import JobSet
+
+_VT = TypeVar("_VT")
+
+
+def inter_job_reuse_mem_strategy(
+    strategy_str: str, job_set: Optional[JobSet] = None, **kwargs: _VT
+) -> None:
+    """Set memory sharing strategy for job set.
+
+    Args:
+        strategy_str: An optional `string` from: `mem_sharing_priority`, `parallelism_priority` 
+        or `custom_parallelism`. 
+        job_set: A `JobSet` object. If None, set default job set.
+    """
+    assert type(strategy_str) is str
+    if job_set == None:
+        job_set = _default_job_set
+    if strategy_str == "reuse_mem_priority":
+        job_set.inter_job_reuse_mem_strategy.reuse_mem_priority.SetInParent()
+        assert job_set.inter_job_reuse_mem_strategy.HasField("reuse_mem_priority")
+    elif strategy_str == "parallelism_priority":
+        job_set.inter_job_reuse_mem_strategy.parallelism_priority.SetInParent()
+        assert job_set.inter_job_reuse_mem_strategy.HasField("parallelism_priority")
+    elif strategy_str == "custom_parallelism":
+        assert kwargs["job_name_groups"] is not None
+        for job_name_group in kwargs["job_name_groups"]:
+            group = (
+                job_set.inter_job_reuse_mem_strategy.custom_parallelism.nonparallel_group.add()
+            )
+            for job_name in job_name_group:
+                assert type(job_name) is str
+                group.job_name.append(job_name)
+
+
+_default_job_set = JobSet()
diff --git a/python/oneflow/framework/local_blob.py b/python/oneflow/framework/local_blob.py
new file mode 100644
index 0000000000000000000000000000000000000000..273bef4b70d24e92fe88b209c88b21e866da5a4e
--- /dev/null
+++ b/python/oneflow/framework/local_blob.py
@@ -0,0 +1,109 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import numpy as np
+
+import oneflow._oneflow_internal
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+class LocalBlob(object):
+    def __init__(self, ndarray, is_dynamic):
+        self.ndarray_ = ndarray
+        self.is_dynamic_ = is_dynamic
+
+    @property
+    def is_dynamic(self):
+        return self.is_dynamic_
+
+    def ndarray_list(self):
+        print(
+            "WARNING:",
+            "LocalBlob.ndarray_list is deprecated, please use LocalBlob.numpy()\n",
+            traceback.format_stack()[-2],
+        )
+        return self.numpy_list()
+
+    def numpy_list(self):
+        return [self.numpy()]
+
+    def ndarray(self):
+        print(
+            "WARNING:",
+            "LocalBlob.ndarray is deprecated, please use LocalBlob.numpy()\n",
+            traceback.format_stack()[-2],
+        )
+        return self.numpy()
+
+    def numpy(self, parallel_id=None):
+        assert parallel_id is None or parallel_id == 0
+        return self.ndarray_
+
+    def parallel_num(self):
+        return 1
+
+    def __getattr__(self, attr):
+        return getattr(self.numpy(), attr)
+
+
+def MakeLocalBlob4EagerBlob(eager_blob):
+    assert isinstance(eager_blob, oneflow._oneflow_internal.EagerBlobTrait)
+    if isinstance(eager_blob, oneflow._oneflow_internal.EagerMirroredBlob):
+        assert eager_blob.numpy_size() == 1
+        return LocalBlob(eager_blob.numpy(), is_dynamic=eager_blob.is_dynamic)
+    elif isinstance(eager_blob, oneflow._oneflow_internal.EagerConsistentBlob):
+        return LocalBlob(eager_blob.numpy(), is_dynamic=False)
+    else:
+        raise NotImplementedError
+
+
+non_override_field = set(
+    [
+        "__class__",
+        "__doc__",
+        "__new__",
+        "__init__",
+        "__del__",
+        "__call__",
+        "__getattr__",
+        "__getattribute__",
+        "__setattr__",
+        "__delattr__",
+        "__dir__",
+        "__get__",
+        "__set__",
+        "__delete__",
+    ]
+)
+
+
+def MakeBlobMethod(field_name):
+    def ConvertOtherArgs(args):
+        return [x.numpy() if isinstance(x, LocalBlob) else x for x in args]
+
+    return lambda self, *args: getattr(self.numpy(), field_name)(
+        *ConvertOtherArgs(args)
+    )
+
+
+for field_name in dir(np.ndarray):
+    if field_name.startswith("__") == False:
+        continue
+    if field_name in non_override_field:
+        continue
+    if hasattr(LocalBlob, field_name) == False:
+        setattr(LocalBlob, field_name, MakeBlobMethod(field_name))
diff --git a/python/oneflow/framework/model.py b/python/oneflow/framework/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720759ea2b27bdca6971f6f2ab054d9dac04f83
--- /dev/null
+++ b/python/oneflow/framework/model.py
@@ -0,0 +1,748 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+__all__ = [
+    "DataModule",
+    "NumpyDataModule",
+    "TrainingConfig",
+    "ValidationConfig",
+    "CheckpointConfig",
+    "Callback",
+    "Model",
+]
+import inspect
+from abc import ABC
+from typing import Any, List, Optional, Tuple, Union
+
+import numpy as np
+
+import oneflow._oneflow_internal
+import oneflow.framework.dtype as dtype_util
+import oneflow.framework.typing as oneflow_typing
+from oneflow.framework.check_point_v2 import GetCheckpoint, LoadVariables, SaveVarDict
+from oneflow.framework.function_util import FunctionConfig as ExecutionConfig
+from oneflow.framework.function_util import api_oneflow_function
+from oneflow.framework.local_blob import LocalBlob
+from oneflow.framework.session_util import api_clear_default_session
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.module import Module
+from oneflow.nn.optimizer.optimizer import Optimizer as OOPOptimizer
+from oneflow.ops.optimizer import Optimizer
+
+
+class DataModule(Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, step_idx: int = 0, optimizer_idx: int = 0):
+        pass
+
+    def infer_oneflow_data_placeholder(
+        self, batch: Tuple[Any] = None, optimizer_idx: int = 0
+    ):
+        return None
+
+
+class NumpyDataModule(DataModule):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, step_idx: int = 0, optimizer_idx: int = 0):
+        pass
+
+    def __call__(self, *args):
+        ret = self.forward(*args)
+        return ret
+
+    def infer_oneflow_data_placeholder(
+        self, batch: Tuple[np.ndarray, ...] = None, optimizer_idx: int = 0
+    ):
+        assert isinstance(batch, tuple), "model.NumpyDataModule must return a tuple."
+        data_placeholder_list = []
+        for item in batch:
+            assert isinstance(
+                item, np.ndarray
+            ), "model.NumpyDataModule must return a tuple of numpy."
+            of_dtype = dtype_util.convert_numpy_dtype_to_oneflow_dtype(item.dtype)
+            numpy_placeholder = oneflow_typing.Numpy.Placeholder(
+                shape=item.shape, dtype=of_dtype
+            )
+            data_placeholder_list.append(numpy_placeholder)
+        return data_placeholder_list
+
+
+class TrainingConfig:
+    def __init__(self):
+        super().__init__()
+        self.exe_cfg = ExecutionConfig()
+        self.data = None
+        self.error_msg = ""
+
+    def config_execution(self, exe_cfg: ExecutionConfig = None):
+        self.exe_cfg = exe_cfg
+
+    def config_data(self, data: DataModule = None):
+        self.data = data
+
+    def check_valid(self):
+        is_valid = True
+        self.error_msg = ""
+        if not isinstance(self.exe_cfg, ExecutionConfig):
+            self.error_msg += "model.TrainingConfig exe_cfg is not ExecutionConfig;"
+            is_valid = False
+        if self.data is None:
+            self.error_msg += "model.TrainingConfig data is None;"
+            is_valid = False
+        if not isinstance(self.data, DataModule):
+            self.error_msg += "model.TrainingConfig data is not DataModule;"
+            is_valid = False
+        return is_valid
+
+
+class ValidationConfig:
+    def __init__(self):
+        super().__init__()
+        self.exe_cfg = ExecutionConfig()
+        self.data = None
+        self.step_interval = 10
+        self.error_msg = ""
+
+    def config_execution(self, exe_cfg: ExecutionConfig = None):
+        self.exe_cfg = exe_cfg
+
+    def config_data(self, data: DataModule = None):
+        self.data = data
+
+    def config_step_interval(self, step_interval: int = 1):
+        self.step_interval = step_interval
+
+    def check_valid(self):
+        is_valid = True
+        self.error_msg = ""
+        if self.data is None:
+            self.error_msg += "model.ValidationConfig data is None;"
+            is_valid = False
+        if not isinstance(self.data, DataModule):
+            self.error_msg += "model.ValidationConfig data is not DataModule;"
+            is_valid = False
+        if self.step_interval <= 0 or not isinstance(self.step_interval, int):
+            self.error_msg += (
+                "model.ValidationConfig step_interval is <= 0 or is not int;"
+            )
+            is_valid = False
+        return is_valid
+
+
+class CheckpointConfig(object):
+    def __init__(self):
+        self.need_load = False
+        self.load_dirpath = None
+        self.need_save = False
+        self.save_dirpath = None
+        self.save_step_interval = 1
+        self.error_msg = ""
+
+    def config_load(self, dirpath: str = None):
+        self.need_load = True
+        assert dirpath is not None, "dirpath should not be None"
+        self.load_dirpath = dirpath
+
+    def config_save(self, dirpath: str = None, step_interval: int = 1):
+        self.need_save = True
+        self.save_dirpath = dirpath
+        assert dirpath is not None, "dirpath should not be None"
+        self.save_step_interval = step_interval
+        assert step_interval > 0, "step_interval should not <= 0"
+        assert isinstance(step_interval, int), "step_interval should be int"
+
+    def check_valid(self):
+        is_valid = True
+        self.error_msg = ""
+        return is_valid
+
+
+class Callback(ABC):
+    """ Abstract base class used to build new callbacks.
+    """
+
+    def on_training_step_end(
+        self,
+        outputs: Optional[
+            Union[LocalBlob, Tuple[LocalBlob, ...], Tensor, Tuple[Tensor, ...]]
+        ],
+        step_idx: int = 0,
+        optimizer_idx: int = 0,
+    ):
+        pass
+
+    def on_validation_step_end(
+        self,
+        outputs: Optional[
+            Union[LocalBlob, Tuple[LocalBlob, ...], Tensor, Tuple[Tensor, ...]]
+        ],
+        step_idx: int = 0,
+    ):
+        pass
+
+
+class Model(ABC, Module):
+    """A high level API for model training and validation.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self._is_deprecated_function_style = (
+            kwargs["is_deprecated_function_style"]
+            if "is_deprecated_function_style" in kwargs
+            else False
+        )
+
+    def forward(self, *args, **kwargs):
+        """Same as `nn.Module.forward()`, here is to define the operations you want to use for prediction.
+        """
+        raise NotImplementedError
+
+    def training_step(self, *args, **kwargs):
+        """Operates on a single batch of data from the training set and return loss.
+        """
+        raise NotImplementedError()
+
+    def validation_step(self, *args, **kwargs):
+        """Operates on a single batch of data from the validation set.
+        """
+        raise NotImplementedError()
+
+    def configure_optimizers(self):
+        """Choose what optimizers and learning-rate schedulers to use in your optimization.
+        Normally you'd need one. But in the case of GANs or similar you might have multiple.
+        """
+        raise NotImplementedError()
+
+    def fit(
+        self,
+        training_config: Optional[TrainingConfig] = None,
+        validation_config: Optional[ValidationConfig] = None,
+        checkpoint_config: Optional[CheckpointConfig] = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+        max_steps: int = 100,
+    ):
+        """ Runs the full training and validation routine.
+        """
+        self._max_steps = max_steps
+        api_clear_default_session()
+        self._sub_models = self._get_and_check_sub_models(
+            training_config, validation_config, checkpoint_config, callbacks
+        )
+        if len(self._sub_models) == 0:
+            return
+        if self._checkpoint_model.is_valid:
+            self._checkpoint_model.load()
+        for step_idx in range(0, self._max_steps):
+            for sub_model in self._sub_models:
+                try:
+                    sub_model.step(step_idx)
+                except Exception as e:
+                    print(
+                        "Model step_idx {} {} failed.".format(step_idx, sub_model.name)
+                    )
+                    raise e
+
+    def method_overrided(self, method_name: str = None) -> bool:
+        return getattr(self.__class__, method_name) != getattr(Model, method_name)
+
+    def _get_and_check_sub_models(
+        self,
+        training_config: Optional[TrainingConfig] = None,
+        validation_config: Optional[ValidationConfig] = None,
+        checkpoint_config: Optional[CheckpointConfig] = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        sub_models = []
+        self._train_model = (
+            TrainModel(training_config, self, callbacks)
+            if self._is_deprecated_function_style
+            else TrainModelOOPStyle(training_config, self, callbacks)
+        )
+        if self._train_model.is_valid:
+            sub_models.append(self._train_model)
+        elif training_config is not None:
+            print(
+                self._train_model.error_msg,
+                "{}'s fit() will not do training.".format(self.__class__.__name__),
+            )
+        self._val_model = (
+            ValidateModel(validation_config, self, callbacks)
+            if self._is_deprecated_function_style
+            else ValidateModelOOPStyle(validation_config, self, callbacks)
+        )
+        if self._val_model.is_valid:
+            sub_models.append(self._val_model)
+        elif validation_config is not None:
+            print(
+                self._val_model.error_msg,
+                "{}'s fit() will not do validation.".format(self.__class__.__name__),
+            )
+        if len(sub_models) == 0:
+            print(
+                "{}'s fit() will do nothing because there has no valid configuration.".format(
+                    self.__class__.__name__
+                )
+            )
+            return sub_models
+        self._checkpoint_model = (
+            CheckpointModel(checkpoint_config, self, callbacks)
+            if self._is_deprecated_function_style
+            else CheckpointModelOOPStyle(checkpoint_config, self, callbacks)
+        )
+        if self._checkpoint_model.is_valid:
+            sub_models.append(self._checkpoint_model)
+        elif checkpoint_config is not None:
+            print(
+                self._checkpoint_model.error_msg,
+                "{}'s fit() will not do checkpoint.".format(self.__class__.__name__),
+            )
+        return sub_models
+
+
+class SubModel(ABC):
+    def __init__(self, name, cfg, model, callbacks):
+        self._cfg = cfg
+        assert isinstance(model, Model)
+        self._model = model
+        self._cbs = callbacks
+        self.name = name
+        self.is_valid = True
+        self.error_msg = (
+            self._model.__class__.__name__ + " " + self.name + " error message: "
+        )
+        if not self._get_and_check_cfg():
+            self.is_valid = False
+        if not self._get_and_check_cbs():
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        raise NotImplementedError
+
+    def _get_and_check_cfg(self):
+        if self._cfg is None:
+            self.error_msg += "config is None;"
+            return False
+        if not self._cfg.check_valid():
+            self.error_msg += self._cfg.error_msg
+            return False
+        else:
+            return True
+
+    def _get_and_check_cbs(self):
+        if self._cbs is None:
+            self._cbs = []
+            return True
+        if isinstance(self._cbs, Callback):
+            self._cbs = [self._cbs]
+            return True
+        if isinstance(self._cbs, list):
+            for cb in self._cbs:
+                assert isinstance(
+                    cb, Callback
+                ), "model callbacks' type must be model.Callback or List[model.Callback]."
+            return True
+        assert (
+            False
+        ), "model callbacks' type must be model.Callback or List[model.Callback]."
+
+    def _method_callback(self, method_name: str = None, *args, **kwargs):
+        for cb in self._cbs:
+            method = getattr(cb, method_name)
+            method(*args, **kwargs)
+
+
+class TrainModel(SubModel):
+    def __init__(
+        self,
+        cfg: TrainingConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("training", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+        if not self._get_and_check_opts():
+            self.is_valid = False
+        if self.is_valid and (not self._get_and_check_jobs()):
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid, self.error_msg
+        for optimizer_idx in range(0, len(self._opts)):
+            outputs = None
+            if self._is_numpy_input:
+                batch = None
+                if step_idx == 0:
+                    batch = self._first_numpy_batch[optimizer_idx]
+                else:
+                    batch = self._cfg.data(step_idx, optimizer_idx)
+                outputs = self._jobs[optimizer_idx](*batch).get()
+            else:
+                outputs = self._jobs[optimizer_idx]().get()
+            self._method_callback(
+                "on_training_step_end",
+                outputs=outputs,
+                step_idx=step_idx,
+                optimizer_idx=optimizer_idx,
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("training_step"):
+            self.error_msg += "model.training_step() is empty;"
+            return False
+        else:
+            return True
+
+    def _get_and_check_opts(self):
+        self._opts = []
+        if not self._model.method_overrided("configure_optimizers"):
+            self.error_msg += "model.configure_optimizers() is empty;"
+            return False
+        opt_conf = self._model.configure_optimizers()
+        if isinstance(opt_conf, Optimizer):
+            self._opts = [opt_conf]
+        elif isinstance(opt_conf, (list, tuple)):
+            for opt in opt_conf:
+                assert isinstance(
+                    opt, Optimizer
+                ), "model.configure_optimizers() must return Optimizer                     or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+            self._opts = opt_conf
+        else:
+            assert (
+                False
+            ), "model.configure_optimizers() must return Optimizer                 or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+        return True
+
+    def _get_and_check_jobs(self):
+        self._is_numpy_input = (
+            True if isinstance(self._cfg.data, NumpyDataModule) else False
+        )
+        self._jobs = []
+        if self._is_numpy_input:
+            self._first_numpy_batch = []
+            for optimizer_idx in range(0, len(self._opts)):
+                batch = self._cfg.data(0, optimizer_idx)
+                self._first_numpy_batch.insert(optimizer_idx, batch)
+                self._jobs.insert(
+                    optimizer_idx, self._construct_numpy_job(batch, optimizer_idx)
+                )
+        else:
+            for optimizer_idx in range(0, len(self._opts)):
+                self._jobs.insert(optimizer_idx, self._construct_job(optimizer_idx))
+        return True
+
+    def _construct_job(self, optimizer_idx: int = 0):
+        def job():
+            batch = self._cfg.data(0, optimizer_idx)
+            outputs = self._model.training_step(
+                batch=batch, optimizer_idx=optimizer_idx
+            )
+            loss = None
+            if isinstance(outputs, tuple) and len(outputs) > 0:
+                loss = outputs[0]
+            else:
+                loss = outputs
+            self._opts[optimizer_idx].minimize(loss)
+            return outputs
+
+        job.__name__ = (
+            self._model.__class__.__name__ + "_Model_train_job_" + str(optimizer_idx)
+        )
+        deco = api_oneflow_function(type="train", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+    def _construct_numpy_job(self, batch, optimizer_idx):
+        def job(*input_batch):
+            outputs = self._model.training_step(
+                batch=input_batch, optimizer_idx=optimizer_idx
+            )
+            loss = None
+            if isinstance(outputs, tuple) and len(outputs) > 0:
+                loss = outputs[0]
+            else:
+                loss = outputs
+            self._opts[optimizer_idx].minimize(loss)
+            return outputs
+
+        _infer_job_signature(self._cfg.data, batch, optimizer_idx, job)
+        job.__name__ = (
+            self._model.__class__.__name__
+            + "_Model_train_numpy_job_"
+            + str(optimizer_idx)
+        )
+        deco = api_oneflow_function(type="train", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+
+class ValidateModel(SubModel):
+    def __init__(
+        self,
+        cfg: ValidationConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("validation", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+        if self.is_valid and (not self._get_and_check_job()):
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if (step_idx + 1) % self._cfg.step_interval == 0:
+            outputs = None
+            if self._is_numpy_input:
+                batch = None
+                if step_idx == 0:
+                    batch = self._first_numpy_batch
+                else:
+                    batch = self._cfg.data(step_idx, 0)
+                outputs = self._job(*batch).get()
+            else:
+                outputs = self._job().get()
+            self._method_callback(
+                "on_validation_step_end", step_idx=step_idx, outputs=outputs
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("validation_step"):
+            self.error_msg += "model.validation_step() is empty;"
+            return False
+        else:
+            return True
+
+    def _get_and_check_job(self):
+        self._is_numpy_input = (
+            True if isinstance(self._cfg.data, NumpyDataModule) else False
+        )
+        self._job = None
+        if not self._is_numpy_input:
+            self._job = self._construct_job()
+        else:
+            batch = self._cfg.data(0, 0)
+            self._first_numpy_batch = batch
+            self._job = self._construct_numpy_job(batch)
+        return True
+
+    def _construct_job(self):
+        def job():
+            batch = self._cfg.data(0, 0)
+            return self._model.validation_step(batch)
+
+        job.__name__ = self._model.__class__.__name__ + "_Model_eval_job"
+        deco = api_oneflow_function(type="predict", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+    def _construct_numpy_job(self, batch: Tuple[np.ndarray, ...] = None):
+        def job(*input_batch):
+            return self._model.validation_step(batch=input_batch)
+
+        _infer_job_signature(self._cfg.data, batch, 0, job)
+        job.__name__ = self._model.__class__.__name__ + "_Model_eval_numpy_job"
+        deco = api_oneflow_function(type="predict", function_config=self._cfg.exe_cfg)
+        return deco(job)
+
+
+class CheckpointModel(SubModel):
+    def __init__(
+        self,
+        cfg: CheckpointConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("checkpointing", cfg, model, callbacks)
+
+    def load(self):
+        assert self.is_valid
+        if self._cfg.need_load:
+            self._load_checkpoint(self._cfg.load_dirpath)
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if self._cfg.need_save:
+            if (step_idx + 1) % self._cfg.save_step_interval == 0:
+                self._save_checkpoint(
+                    dirpath=self._cfg.save_dirpath + "-" + str(step_idx)
+                )
+
+    def _load_checkpoint(self, dirpath: str):
+        """Load model states from a checkpoint.
+        """
+        LoadVariables(GetCheckpoint(path=dirpath))
+
+    def _save_checkpoint(self, dirpath: str):
+        """Save model states as a checkpoint.
+        """
+        SaveVarDict(path=dirpath)
+
+
+class TrainModelOOPStyle(SubModel):
+    def __init__(
+        self,
+        cfg: TrainingConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("training", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+        if not self._get_and_check_opts():
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid, self.error_msg
+        for optimizer_idx in range(0, len(self._opts)):
+            batch = self._cfg.data(step_idx, optimizer_idx)
+            outputs = self._model.training_step(
+                batch=batch, optimizer_idx=optimizer_idx
+            )
+            loss = None
+            if isinstance(outputs, tuple) and len(outputs) > 0:
+                loss = outputs[0]
+            else:
+                loss = outputs
+            loss.backward()
+            opt = self._opts[optimizer_idx]
+            opt.step()
+            opt.zero_grad()
+            self._method_callback(
+                "on_training_step_end",
+                outputs=outputs,
+                step_idx=step_idx,
+                optimizer_idx=optimizer_idx,
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("training_step"):
+            self.error_msg += "model.training_step() is empty;"
+            return False
+        else:
+            return True
+
+    def _get_and_check_opts(self):
+        self._opts = []
+        if not self._model.method_overrided("configure_optimizers"):
+            self.error_msg += "model.configure_optimizers() is empty;"
+            return False
+        opt_conf = self._model.configure_optimizers()
+        if isinstance(opt_conf, OOPOptimizer):
+            self._opts = [opt_conf]
+        elif isinstance(opt_conf, (list, tuple)):
+            for opt in opt_conf:
+                assert isinstance(
+                    opt, OOPOptimizer
+                ), "model.configure_optimizers() must return Optimizer                     or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+            self._opts = opt_conf
+        else:
+            assert (
+                False
+            ), "model.configure_optimizers() must return Optimizer                 or List[Optimizer, ...] or Tuple[Optimizer, ...]"
+        return True
+
+
+class ValidateModelOOPStyle(SubModel):
+    def __init__(
+        self,
+        cfg: ValidationConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("validation", cfg, model, callbacks)
+        if not self._get_and_check_step():
+            self.is_valid = False
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if (step_idx + 1) % self._cfg.step_interval == 0:
+            outputs = None
+            with oneflow._oneflow_internal.autograd.no_grad():
+                inputs = self._cfg.data(step_idx, 0)
+                model_previous_mode = self._model.training
+                self._model.train()
+                outputs = self._model.validation_step(inputs)
+                self._model.train(model_previous_mode)
+            self._method_callback(
+                "on_validation_step_end", step_idx=step_idx, outputs=outputs
+            )
+
+    def _get_and_check_step(self):
+        if not self._model.method_overrided("validation_step"):
+            self.error_msg += "model.validation_step() is empty;"
+            return False
+        else:
+            return True
+
+
+class CheckpointModelOOPStyle(SubModel):
+    def __init__(
+        self,
+        cfg: CheckpointConfig = None,
+        model: Model = None,
+        callbacks: Optional[Union[Callback, List[Callback]]] = None,
+    ):
+        super().__init__("checkpointing", cfg, model, callbacks)
+
+    def load(self):
+        assert self.is_valid
+        if self._cfg.need_load:
+            self._load_checkpoint(self._cfg.load_dirpath)
+
+    def step(self, step_idx: int = 0):
+        assert self.is_valid
+        if self._cfg.need_save:
+            if (step_idx + 1) % self._cfg.save_step_interval == 0:
+                self._save_checkpoint(
+                    dirpath=self._cfg.save_dirpath + "-" + str(step_idx)
+                )
+
+    def _load_checkpoint(self, dirpath: str):
+        """Load model states from a checkpoint.
+        """
+        stat_dict = GetCheckpoint(path=dirpath)
+        self._model.load_state_dict(stat_dict)
+
+    def _save_checkpoint(self, dirpath: str):
+        """Save model states as a checkpoint.
+        """
+        stat_dict = self._model.state_dict()
+        SaveVarDict(path=dirpath, var_dict=stat_dict)
+
+
+def _infer_job_signature(data_module, batch, optimizer_idx, job):
+    para_list = []
+    placeholder_list = data_module.infer_oneflow_data_placeholder(batch, optimizer_idx)
+    for (i, placeholder) in enumerate(placeholder_list):
+        para_name = (
+            data_module.__class__.__name__
+            + "_opt_"
+            + str(optimizer_idx)
+            + "_para_"
+            + str(i)
+        )
+        para_list.append(
+            inspect.Parameter(
+                name=para_name,
+                kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                annotation=placeholder,
+            )
+        )
+    origin_sig = inspect.signature(job)
+    new_sig = origin_sig.replace(parameters=para_list)
+    job.__oneflow_function_signature__ = new_sig
diff --git a/python/oneflow/framework/module.py b/python/oneflow/framework/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..5073e23647693a075a60b0f7fb3b3409d61dcf15
--- /dev/null
+++ b/python/oneflow/framework/module.py
@@ -0,0 +1,45 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow.framework.id_util as id_util
+
+
+class Module(object):
+    def __init__(self, name=None):
+        if name is None:
+            name = id_util.UniqueStr("Module_")
+        self.module_name_ = name
+        self.call_seq_no_ = 0
+
+    @property
+    def module_name(self):
+        return self.module_name_
+
+    @property
+    def call_seq_no(self):
+        return self.call_seq_no_
+
+    def forward(self, *args):
+        raise NotImplementedError()
+
+    def __call__(self, *args):
+        ret = self.forward(*args)
+        self.call_seq_no_ = self.call_seq_no_ + 1
+        return ret
+
+    def __del__(self):
+        assert (
+            getattr(type(self), "__call__") is Module.__call__
+        ), "do not override __call__"
diff --git a/python/oneflow/framework/multi_client_session.py b/python/oneflow/framework/multi_client_session.py
new file mode 100644
index 0000000000000000000000000000000000000000..07308479c35f7113ef105f1337a4d1bb28c601c3
--- /dev/null
+++ b/python/oneflow/framework/multi_client_session.py
@@ -0,0 +1,117 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import enum
+import inspect
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+import oneflow.core.job.job_set_pb2 as job_set_util
+import oneflow.framework.c_api_util as c_api_util
+
+
+class MultiClientSession(object):
+    class Status(enum.Enum):
+        CREATED = 1
+        INITED = 2
+        CLOSED = 3
+
+    def __init__(self, sess_id):
+        self.sess_ = oneflow._oneflow_internal.RegsiterSession(sess_id)
+        oneflow._oneflow_internal.CreateMultiClientSessionContext()
+        self.config_proto_ = self._make_config_proto()
+        self.function_flag_name2default_val_ = {}
+        self._update_function_flag_name2defaultVal()
+        self.scope_attr_name2default_val_ = {}
+        self._update_scope_attr_name2defaultVal()
+        self.status_ = self.Status.CREATED
+
+    def __del__(self):
+        self.TryClose()
+
+    def TryInit(self):
+        self._check_status(self.Status.CREATED, self.Status.INITED)
+        if self.status_ == self.Status.CREATED:
+            config_proto_str = text_format.MessageToString(self.config_proto)
+            oneflow._oneflow_internal.InitMultiClientSessionContext(config_proto_str)
+            self.status_ = self.Status.INITED
+
+    def TryClose(self):
+        if self.status_ != self.Status.CLOSED:
+            oneflow._oneflow_internal.TryDestroyMultiClientSessionContext()
+            oneflow._oneflow_internal.ClearSessionById(self.id)
+        self.status_ = self.Status.CLOSED
+
+    @property
+    def status(self):
+        return self.status_
+
+    @property
+    def id(self):
+        return self.sess_.id
+
+    @property
+    def config_proto(self):
+        return self.config_proto_
+
+    @property
+    def resource(self):
+        self._check_status(self.Status.INITED)
+        return c_api_util.CurrentResource()
+
+    @property
+    def function_flag_name2default_val(self):
+        return self.function_flag_name2default_val_
+
+    @property
+    def scope_attr_name2default_val(self):
+        return self.scope_attr_name2default_val_
+
+    @property
+    def is_running(self):
+        return self.status_ == self.Status.INITED
+
+    def AnyGlobalFunctionDefined(self):
+        return False
+
+    def _check_status(self, *status):
+        check_success = False
+        for stat in status:
+            if self.status_ == stat:
+                check_success = True
+                break
+        if check_success is False:
+            caller_func_name = inspect.stack()[1].function
+            allowed_status = " or ".join([str(stat) for stat in status])
+            raise ValueError(
+                "The calling to {} is only allowed when status is {}, but current status is {}".format(
+                    caller_func_name, allowed_status, self.status_
+                )
+            )
+
+    def _make_config_proto(self):
+        config_proto = job_set_util.ConfigProto()
+        config_proto.resource.SetInParent()
+        config_proto.session_id = self.id
+        return config_proto
+
+    def _update_function_flag_name2defaultVal(self):
+        items = c_api_util.GetFunctionConfigDef().attr_name2attr_def.items()
+        self.function_flag_name2default_val_ = {k: v.default_val for (k, v) in items}
+
+    def _update_scope_attr_name2defaultVal(self):
+        items = c_api_util.GetScopeConfigDef().attr_name2attr_def.items()
+        self.scope_attr_name2default_val_ = {k: v.default_val for (k, v) in items}
diff --git a/python/oneflow/framework/ofblob.py b/python/oneflow/framework/ofblob.py
new file mode 100644
index 0000000000000000000000000000000000000000..259142d1586ea04e8586f068c8dbe83fd95a3bc2
--- /dev/null
+++ b/python/oneflow/framework/ofblob.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from functools import reduce
+
+import numpy as np
+from google.protobuf import text_format
+
+import oneflow as flow
+import oneflow._oneflow_internal
+from oneflow.framework.dtype import convert_proto_dtype_to_oneflow_dtype
+from oneflow.support.box import Box
+
+
+class OfBlob(object):
+    def __init__(self, of_blob_ptr):
+        self.of_blob_ptr_ = of_blob_ptr
+
+    @property
+    def dtype(self):
+        return convert_proto_dtype_to_oneflow_dtype(
+            oneflow._oneflow_internal.Ofblob_GetDataType(self.of_blob_ptr_)
+        )
+
+    @property
+    def static_shape(self):
+        num_axes = oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+        dst_ndarray = np.ndarray(num_axes, dtype=np.int64)
+        oneflow._oneflow_internal.OfBlob_CopyStaticShapeTo(
+            self.of_blob_ptr_, dst_ndarray
+        )
+        return tuple(dst_ndarray.tolist())
+
+    @property
+    def shape(self):
+        num_axes = oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+        dst_ndarray = np.zeros(num_axes, dtype=np.int64)
+        oneflow._oneflow_internal.OfBlob_CopyShapeTo(self.of_blob_ptr_, dst_ndarray)
+        return tuple(dst_ndarray.tolist())
+
+    def set_shape(self, shape):
+        assert isinstance(shape, (list, tuple))
+        assert len(shape) == oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+        oneflow._oneflow_internal.OfBlob_CopyShapeFrom(
+            self.of_blob_ptr_, np.array(shape, dtype=np.int64)
+        )
+
+    @property
+    def num_axes(self):
+        return oneflow._oneflow_internal.OfBlob_NumAxes(self.of_blob_ptr_)
+
+    @property
+    def is_dynamic(self):
+        return oneflow._oneflow_internal.OfBlob_IsDynamic(self.of_blob_ptr_)
+
+    def CopyToNdarray(self):
+        return self._CopyToNdarray()
+
+    def CopyFromNdarray(self, src_ndarray):
+        if self.is_dynamic:
+            self.set_shape(src_ndarray.shape)
+        else:
+            shape_tensor = np.zeros(self.num_axes, dtype=np.int64)
+            oneflow._oneflow_internal.OfBlob_CopyShapeTo(
+                self.of_blob_ptr_, shape_tensor
+            )
+            shape = tuple(shape_tensor.tolist())
+            assert src_ndarray.shape == shape
+        return self._CopyBodyFromNdarray(src_ndarray)
+
+    def _CopyBodyFromNdarray(self, src_ndarray):
+        method_name = oneflow._oneflow_internal.Dtype_GetOfBlobCopyFromBufferFuncName(
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(self.dtype)
+        )
+        copy_method = getattr(oneflow._oneflow_internal, method_name)
+        copy_method(self.of_blob_ptr_, src_ndarray)
+
+    def _CopyToNdarray(self):
+        method_name = oneflow._oneflow_internal.Dtype_GetOfBlobCopyToBufferFuncName(
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(self.dtype)
+        )
+        copy_method = getattr(oneflow._oneflow_internal, method_name)
+        shape_tensor = np.zeros(self.num_axes, dtype=np.int64)
+        oneflow._oneflow_internal.OfBlob_CopyShapeTo(self.of_blob_ptr_, shape_tensor)
+        shape = tuple(shape_tensor.tolist())
+        tensor = np.zeros(
+            shape, dtype=flow.convert_oneflow_dtype_to_numpy_dtype(self.dtype)
+        )
+        copy_method(self.of_blob_ptr_, tensor)
+        return tensor
diff --git a/python/oneflow/framework/op_expr_util.py b/python/oneflow/framework/op_expr_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..25cecc568aac6f07864f80dc4f7b491e0c1c5261
--- /dev/null
+++ b/python/oneflow/framework/op_expr_util.py
@@ -0,0 +1,43 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+import oneflow._oneflow_internal
+from oneflow.framework.attr_util import convert_to_user_attr_value
+
+
+def user_op_expr_call(self, *args, **kwargs):
+    args = list(args)
+    for i in range(len(args)):
+        arg = args[i]
+        if isinstance(arg, flow.Tensor):
+            if not arg.is_determined:
+                arg.determine()
+            args[i] = arg._local_or_consistent_tensor
+    attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+    for (attr_name, attr_value) in kwargs.items():
+        assert isinstance(attr_name, str)
+        attrs[attr_name] = convert_to_user_attr_value(
+            self.op_type_name, attr_name, attr_value
+        )
+    try:
+        results = self.apply(args, attrs)
+    except flow._oneflow_internal.exception.Exception:
+        raise oneflow._oneflow_internal.exception.GetThreadLocalLastError()
+    return results
+
+
+def RegisterMethod4UserOpExpr():
+    oneflow._oneflow_internal.one.UserOpExpr.__call__ = user_op_expr_call
diff --git a/python/oneflow/framework/op_util.py b/python/oneflow/framework/op_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2902b3a728548bd8f06cdf0f273c3576528c33c
--- /dev/null
+++ b/python/oneflow/framework/op_util.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+from oneflow.core.operator.op_conf_pb2 import OperatorConf
+
+
+def IsOpConfOnlyCpuSupported(op_conf):
+    assert isinstance(op_conf, OperatorConf)
+    '\n    global _cpu_only_op_type_cases\n    if _cpu_only_op_type_cases == None:\n        _cpu_only_op_type_cases = set()\n        for field in OperatorConf.DESCRIPTOR.oneofs_by_name["op_type"].fields:\n            if oneflow._oneflow_internal.IsOpTypeCaseCpuSupportOnly(field.number):\n                _cpu_only_op_type_cases.add(field.number)\n    op_type_field = op_conf.WhichOneof("op_type")\n    return OperatorConf.DESCRIPTOR.fields_by_name[op_type_field].number in _cpu_only_op_type_cases\n    '
+    op_type_field = op_conf.WhichOneof("op_type")
+    if op_type_field == "user_conf":
+        return IsUserOpOnlyCpuSupported(op_conf.user_conf.op_type_name)
+    else:
+        field_number = OperatorConf.DESCRIPTOR.fields_by_name[op_type_field].number
+        return oneflow._oneflow_internal.IsOpTypeCaseCpuSupportOnly(field_number)
+
+
+def IsUserOpOnlyCpuSupported(op_type_name):
+    return oneflow._oneflow_internal.IsOpTypeNameCpuSupportOnly(op_type_name)
diff --git a/python/oneflow/framework/ops.py b/python/oneflow/framework/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..6826f9b15f0fbb540602341404e724dfbbb6031b
--- /dev/null
+++ b/python/oneflow/framework/ops.py
@@ -0,0 +1,220 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence, Union
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.core.common.data_type_pb2 as data_type_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.compile_context as compile_context
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.hob as hob
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.support.enable_if as enable_if
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def repeat(input, repeat_num, name=None):
+    assert not oneflow.eager_execution_enabled()
+    return (
+        oneflow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Repeat_")
+        )
+        .Op("repeat")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("repeat_num", repeat_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_acc(
+    one: oneflow._oneflow_internal.BlobDesc,
+    max_acc_num: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([acc])
+    return func(one, max_acc_num, name=name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def acc(one, max_acc_num, name=None):
+    assert not oneflow.eager_execution_enabled()
+    return (
+        oneflow.user_op_builder(name if name is not None else id_util.UniqueStr("Acc_"))
+        .Op("acc")
+        .Input("in", [one])
+        .Output("out")
+        .Attr("max_acc_num", max_acc_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_unpack(
+    input: oneflow._oneflow_internal.BlobDesc,
+    unpack_num: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([unpack])
+    return func(input, unpack_num, name=name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def unpack(input, unpack_num, name=None):
+    assert not oneflow.eager_execution_enabled()
+    return (
+        oneflow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Unpack_")
+        )
+        .Op("unpack")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("unpack_num", unpack_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_pack(
+    input: oneflow._oneflow_internal.BlobDesc, pack_num: int, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([pack])
+    return func(input, pack_num, name=name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def pack(input, pack_num, name=None):
+    assert not oneflow.eager_execution_enabled()
+    return (
+        oneflow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Pack_")
+        )
+        .Op("pack")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("pack_num", pack_num)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def api_parallel_cast(
+    input: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+    distribute: Optional[oneflow._oneflow_internal.distribute.Distribute] = None,
+    gradient_distribute: Optional[
+        oneflow._oneflow_internal.distribute.Distribute
+    ] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([parallel_cast])
+    return func(
+        input, name=name, distribute=distribute, gradient_distribute=gradient_distribute
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def parallel_cast(input, name=None, distribute=None, gradient_distribute=None):
+    if name is None:
+        name = id_util.UniqueStr("ParallelCast_")
+
+    def distribute_to_str(dist):
+        dist_str = ""
+        if dist is None:
+            pass
+        elif type(dist) is oneflow._oneflow_internal.distribute.SplitDistribute:
+            dist_str = "S({})".format(dist.axis)
+        elif type(dist) is oneflow._oneflow_internal.distribute.BroadcastDistribute:
+            dist_str = "B"
+        else:
+            raise ValueError("unsupported distribute")
+        return dist_str
+
+    sbp_parallel = distribute_to_str(distribute)
+    grad_sbp_parallel = distribute_to_str(gradient_distribute)
+    op = (
+        oneflow.user_op_builder(name)
+        .Op("parallel_cast")
+        .Input("in", [input])
+        .Output("out")
+        .Attr("sbp_parallel", sbp_parallel)
+        .Attr("grad_sbp_parallel", grad_sbp_parallel)
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
+
+
+def api_hierarchical_parallel_cast(
+    input: oneflow._oneflow_internal.BlobDesc,
+    parallel_distribution: Sequence[str],
+    grad_mode: Optional[str] = None,
+    grad_parallel_distribution: Sequence[str] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([hierarchical_parallel_cast])
+    return func(
+        input,
+        parallel_distribution=parallel_distribution,
+        grad_mode=grad_mode,
+        grad_parallel_distribution=grad_parallel_distribution,
+        name=name,
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def hierarchical_parallel_cast(
+    input, parallel_distribution, grad_mode, grad_parallel_distribution, name
+):
+    if name is None:
+        name = id_util.UniqueStr("HierarchicalParallelCast_")
+
+    def distribute_to_str(dist):
+        if dist is None:
+            return ""
+        elif type(dist) is str:
+            return dist
+        elif type(dist) is oneflow._oneflow_internal.distribute.SplitDistribute:
+            return "S({})".format(dist.axis)
+        elif type(dist) is oneflow._oneflow_internal.distribute.BroadcastDistribute:
+            return "B"
+        else:
+            raise ValueError("unsupported distribute")
+
+    op = (
+        oneflow.user_op_builder(name)
+        .Op("hierarchical_parallel_cast")
+        .Input("in", [input])
+        .Output("out")
+        .Attr(
+            "parallel_distribution", list(map(distribute_to_str, parallel_distribution))
+        )
+        .Attr("grad_mode", grad_mode or "restore")
+        .Attr(
+            "grad_parallel_distribution",
+            list(map(distribute_to_str, grad_parallel_distribution))
+            if grad_parallel_distribution
+            else [],
+        )
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
diff --git a/python/oneflow/framework/placement_context.py b/python/oneflow/framework/placement_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8451b50f141771ee7bdfc9d46e7804cd46645e5
--- /dev/null
+++ b/python/oneflow/framework/placement_context.py
@@ -0,0 +1,120 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import re
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow.core.job.placement_pb2 as placement_pb
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.op_util as op_util
+import oneflow.framework.session_context as session_ctx
+
+
+class PlacementScope(object):
+    pass
+
+
+class EmptyPlacementScope(PlacementScope):
+    def __init__(self, device_tag, machine_device_ids, hierarchy):
+        if isinstance(machine_device_ids, (list, tuple)) == False:
+            machine_device_ids = [machine_device_ids]
+        self.device_tag_ = device_tag
+        self.machine_device_ids_ = machine_device_ids
+        self.hierarchy_ = hierarchy
+
+    @property
+    def device_tag(self):
+        return self.device_tag_
+
+    @property
+    def machine_device_ids(self):
+        return self.machine_device_ids_
+
+    @property
+    def hierarchy(self):
+        return self.hierarchy_
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, *args):
+        pass
+
+
+class GlobalModePlacementScope(PlacementScope):
+    def __init__(self, scope_ctx):
+        self.scope_ctx_ = scope_ctx
+
+    def __enter__(self):
+        self.scope_ctx_.__enter__()
+
+    def __exit__(self, *args):
+        self.scope_ctx_.__exit__(*args)
+
+
+def MakeParallelConf4Resource(device_tag, resource):
+    if device_tag == "gpu":
+        assert resource.HasField("gpu_device_num")
+        machine_device_ids = GetGpuMachineDeviceIds(resource)
+    elif device_tag == "cpu":
+        assert resource.HasField("cpu_device_num")
+        machine_device_ids = GetCpuMachineDeviceIds(resource)
+    else:
+        raise NotImplementedError
+    return oneflow._oneflow_internal.MakeParallelConf(device_tag, machine_device_ids)
+
+
+def MakeMachineId2DeviceIdList(parallel_conf):
+    parallel_conf_str = str(parallel_conf)
+    global _parallel_conf_str2ofrecord
+    if parallel_conf_str not in _parallel_conf_str2ofrecord:
+        ofrecord = c_api_util.GetMachine2DeviceIdListOFRecordFromParallelConf(
+            parallel_conf
+        )
+        _parallel_conf_str2ofrecord[parallel_conf_str] = {
+            int(k): list(v.int32_list.value) for (k, v) in ofrecord.feature.items()
+        }
+    return _parallel_conf_str2ofrecord[parallel_conf_str]
+
+
+def GetParallelSize(key2list):
+    size = 0
+    for (k, v) in key2list.items():
+        size += len(v)
+    return size
+
+
+def GetGpuMachineDeviceIds(resource):
+    assert resource.machine_num > 0
+    assert resource.HasField("gpu_device_num")
+    return [
+        "%s:0-%s" % (m_id, resource.gpu_device_num - 1)
+        for m_id in range(resource.machine_num)
+    ]
+
+
+def GetCpuMachineDeviceIds(resource):
+    assert resource.machine_num > 0
+    assert resource.HasField("cpu_device_num")
+    return [
+        "%s:0-%s" % (m_id, resource.cpu_device_num - 1)
+        for m_id in range(resource.machine_num)
+    ]
+
+
+_parallel_conf_str2ofrecord = {}
diff --git a/python/oneflow/framework/placement_util.py b/python/oneflow/framework/placement_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..565e7ebd48409b107d2179bb43d8968f2972577e
--- /dev/null
+++ b/python/oneflow/framework/placement_util.py
@@ -0,0 +1,139 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import re
+import traceback
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.framework.hob as hob
+import oneflow.framework.placement_context as placement_ctx
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_placement(*args, **kwargs):
+    print(
+        "WARNING:",
+        "oneflow.device_prior_placement/oneflow.fixed_placement",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.scope.placement"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return api_placement(*args, **kwargs)
+
+
+def api_placement(
+    device_tag: str, machine_device_ids: str, hierarchy=None
+) -> placement_ctx.PlacementScope:
+    """Create a scope. All ops within the scope will run on specified device that placed by  "device_tag" and "machine_device_ids".
+
+    Args:
+        device_tag (str): Device tag, "cpu" or "gpu" only
+        machine_device_ids (str): List of string that specifies what machine & device(s) to use, the format is "List[<NODE INDEX>:<DEVICE START INDEX>-<DEVICE END INDEX>, <NODE INDEX>:<DEVICE START INDEX>-<DEVICE END INDEX>, ...]", For example, "0:0" means use the device 0 of machine 0, and "1:4-6" means use device 4, 5, 6 of machine 1.
+
+    Returns:
+        placement_ctx.DevicePriorPlacementScope:  Placement scope
+
+    For example:
+
+    If you run program on single machine, you can assign the specified device like this:
+
+    .. code-block:: python
+
+        with flow.scope.placement("gpu", "0:0"):
+            logits = lenet(images, train=False)
+            loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
+            flow.losses.add_loss(loss)
+
+    Or you run distributed program, you can assign the specified devices like this:
+
+    .. code-block:: python
+
+        # configure machines ids, ips, etc.
+        with flow.scope.placement("gpu", ['0:0-7', '1:0-7']):
+            logits = lenet(images, train=False)
+            loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
+            flow.losses.add_loss(loss)
+
+    """
+    if oneflow._oneflow_internal.flags.with_cuda() == False and device_tag == "gpu":
+        device_tag = "cpu"
+    assert (
+        isinstance(hierarchy, (list, tuple, oneflow._oneflow_internal.Size))
+        or hierarchy is None
+    )
+    func = enable_if.unique(
+        [
+            GetEmptyPlacementScope,
+            GetNormalModePlacementScope,
+            GetGlobalModePlacementScope,
+        ]
+    )
+    return func(device_tag, machine_device_ids, hierarchy)
+
+
+@enable_if.condition(
+    hob.in_normal_mode & hob.env_initialized & ~hob.session_initialized
+)
+def GetEmptyPlacementScope(device_tag, machine_device_ids, hierarchy=None):
+    return placement_ctx.EmptyPlacementScope(device_tag, machine_device_ids, hierarchy)
+
+
+@enable_if.condition(hob.in_normal_mode & hob.session_initialized)
+def GetNormalModePlacementScope(device_tag, machine_device_ids, hierarchy=None):
+    if isinstance(machine_device_ids, tuple):
+        machine_device_ids = list(machine_device_ids)
+    if not isinstance(machine_device_ids, list):
+        machine_device_ids = [machine_device_ids]
+    sess = session_ctx.GetDefaultSession()
+    if hierarchy is not None:
+        hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
+    scope = scope_util.MakeScope(
+        lambda old_scope, builder: builder.BuildScopeWithNewParallelDesc(
+            old_scope, device_tag, machine_device_ids, hierarchy
+        )
+    )
+    return scope_util.ScopeContext(scope)
+
+
+@enable_if.condition(hob.in_global_mode)
+def GetGlobalModePlacementScope(device_tag, machine_device_ids, hierarchy=None):
+    if isinstance(machine_device_ids, (list, tuple)) == False:
+        machine_device_ids = [machine_device_ids]
+    sess = session_ctx.GetDefaultSession()
+    if hierarchy is not None:
+        hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
+
+    def BuildScope(old_scope, builder):
+        return builder.BuildScopeWithNewParallelDesc(
+            old_scope, device_tag, machine_device_ids, hierarchy
+        )
+
+    scope_ctx = scope_util.ScopeContext(scope_util.MakeScope(BuildScope))
+    return placement_ctx.GlobalModePlacementScope(scope_ctx)
+
+
+def GetDefaultMachineDeviceIds(resource):
+    if resource.HasField("gpu_device_num") and resource.gpu_device_num > 0:
+        return ("gpu", placement_ctx.GetGpuMachineDeviceIds(resource))
+    elif resource.HasField("cpu_device_num"):
+        return ("cpu", placement_ctx.GetCpuMachineDeviceIds(resource))
+    else:
+        raise NotImplementedError
diff --git a/python/oneflow/framework/profiler.py b/python/oneflow/framework/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b2297b57bc5281c4107ac57e5b9c333ccbc99ab
--- /dev/null
+++ b/python/oneflow/framework/profiler.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+
+
+def RangePush(range_name):
+    oneflow._oneflow_internal.profiler.RangePush(range_name)
+
+
+def RangePop():
+    oneflow._oneflow_internal.profiler.RangePop()
+
+
+def ProfilerStart():
+    oneflow._oneflow_internal.profiler.ProfilerStart()
+
+
+def ProfilerStop():
+    oneflow._oneflow_internal.profiler.ProfilerStop()
diff --git a/python/oneflow/framework/pull_util.py b/python/oneflow/framework/pull_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8b8faada0e8bc5ee16252428d1616d6757bd91a
--- /dev/null
+++ b/python/oneflow/framework/pull_util.py
@@ -0,0 +1,281 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import threading
+
+import numpy as np
+
+import oneflow._oneflow_internal
+import oneflow.framework.local_blob as local_blob_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+class FutureRemoteBlobs(object):
+    def __init__(self):
+        self.inited_ = False
+
+    def get(self):
+        raise NotImplementedError
+
+    def async_get(self, callback):
+        raise NotImplementedError
+
+    def SetResult(self, remote_blobs):
+        raise NotImplementedError
+
+    def Inited(self):
+        assert self.inited_ is False
+        self.inited_ = True
+        return self
+
+
+class LazyFutureRemoteBlobs(FutureRemoteBlobs):
+    def __init__(self, session):
+        super().__init__()
+        self.session_ = session
+        self.cond_var_ = threading.Condition()
+        self.out_remote_blob_pullers_ = []
+        self.finished_cnt_ = 0
+        self.data_delivered_ = False
+        self.async_get_callback_ = lambda: None
+
+    def get(self):
+        assert self.inited_
+        assert self.data_delivered_ == False
+        self._Wait()
+        self.data_delivered_ = True
+        return self._TrySyncAndGetResultNdarray(self.out_remote_blob_pullers_)
+
+    def async_get(self, callback):
+        assert self.inited_
+        assert self.data_delivered_ == False
+        pullers_cnt = self._GetPullersCnt()
+
+        def Callback():
+            assert self.finished_cnt_ <= pullers_cnt
+            if self.finished_cnt_ == pullers_cnt:
+                callback(
+                    self._TrySyncAndGetResultNdarray(self.out_remote_blob_pullers_)
+                )
+
+        try:
+            self.cond_var_.acquire()
+            if self.finished_cnt_ == pullers_cnt:
+                Callback()
+            else:
+                self.async_get_callback_ = Callback
+        finally:
+            self.cond_var_.release()
+        self.data_delivered_ = True
+
+    def SetResult(self, out_remote_blobs):
+        assert self.inited_ == False
+        assert isinstance(self.out_remote_blob_pullers_, list)
+        assert len(self.out_remote_blob_pullers_) == 0
+        pullers = self._MakeRemoteBlobPullers(out_remote_blobs)
+        self.out_remote_blob_pullers_ = pullers
+        for puller in self._FlatConsistentBlobPullers(pullers):
+            puller.AsyncPull(self._FinishCallback)
+        return self
+
+    def _FinishCallback(self):
+        self.cond_var_.acquire()
+        self.finished_cnt_ += 1
+        self.cond_var_.notify()
+        self.async_get_callback_()
+        self.cond_var_.release()
+
+    def _Wait(self):
+        pullers_cnt = self._GetPullersCnt()
+        self.cond_var_.acquire()
+        while self.finished_cnt_ != pullers_cnt:
+            self.cond_var_.wait()
+        self.cond_var_.release()
+
+    def _TrySyncAndGetResultNdarray(self, pullers):
+        if self.session_.HasAnyCallbackAfterFunctionReturn():
+            self.session_.Sync()
+        return self._GetResultLocalBlob(pullers)
+
+    def _GetResultLocalBlob(self, pullers):
+        assert self.inited_
+        if isinstance(pullers, _BlobPuller):
+            return pullers.result
+        if isinstance(pullers, (list, tuple)):
+            return type(pullers)((self._GetResultLocalBlob(x) for x in pullers))
+        if isinstance(pullers, dict):
+            return {k: self._GetResultLocalBlob(v) for (k, v) in pullers.items()}
+        raise NotImplementedError
+
+    def _GetPullersCnt(self):
+        cnt = 0
+        for _ in self._FlatConsistentBlobPullers(self.out_remote_blob_pullers_):
+            cnt += 1
+        return cnt
+
+    def _FlatConsistentBlobPullers(self, pullers):
+        if isinstance(pullers, _BlobPuller):
+            for x in pullers.FlatConsistentBlobPullers():
+                yield x
+        elif isinstance(pullers, list) or isinstance(pullers, tuple):
+            for elem in pullers:
+                for x in self._FlatConsistentBlobPullers(elem):
+                    yield x
+        elif isinstance(pullers, dict):
+            for (_, v) in pullers.items():
+                for x in self._FlatConsistentBlobPullers(v):
+                    yield x
+        else:
+            raise NotImplementedError
+
+    def _MakeRemoteBlobPullers(self, out_remote_blobs):
+        if isinstance(out_remote_blobs, oneflow._oneflow_internal.ConsistentBlob):
+            return _ConsistentBlobPuller(out_remote_blobs, self.session_)
+        if isinstance(out_remote_blobs, oneflow._oneflow_internal.MirroredBlob):
+            return _MirroredBlobPuller(out_remote_blobs, self.session_)
+        if isinstance(out_remote_blobs, list) or isinstance(out_remote_blobs, tuple):
+            return type(out_remote_blobs)(
+                (self._MakeRemoteBlobPullers(x) for x in out_remote_blobs)
+            )
+        if isinstance(out_remote_blobs, dict):
+            return {
+                k: self._MakeRemoteBlobPullers(v) for (k, v) in out_remote_blobs.items()
+            }
+        raise NotImplementedError
+
+
+class _BlobPuller(object):
+    def __init__(self, session):
+        self.session_ = session
+
+    def FlatConsistentBlobPullers(self):
+        raise NotImplementedError
+
+    @property
+    def result(self):
+        raise NotImplementedError
+
+
+class _ConsistentBlobPuller(_BlobPuller):
+    def __init__(self, consistent_blob, session):
+        _BlobPuller.__init__(self, session)
+        self.result_ = None
+        self.consistent_blob_ = consistent_blob
+
+    @property
+    def result(self):
+        assert self.result_ is not None
+        return self.result_
+
+    def FlatConsistentBlobPullers(self):
+        yield self
+
+    def AsyncPull(self, pull_cb):
+        def PullCallback(of_blob):
+            self.result_ = local_blob_util.LocalBlob(
+                of_blob.CopyToNdarray(), self.consistent_blob_.is_dynamic
+            )
+            pull_cb()
+
+        self.session_.AsyncPull(self.consistent_blob_.op_name, PullCallback)
+
+
+class _MirroredBlobPuller(_BlobPuller):
+    def __init__(self, mirrored_blob, session):
+        _BlobPuller.__init__(self, session)
+        self.mirrored_blob_ = mirrored_blob
+        self.sub_pullers_ = tuple(
+            (
+                _ConsistentBlobPuller(x, self.session_)
+                for x in mirrored_blob.sub_consistent_blob_list
+            )
+        )
+        self.local_mirrored_blob_ = None
+
+    @property
+    def result(self):
+        if self.local_mirrored_blob_ is not None:
+            return self.local_mirrored_blob_
+        local_blob_list = [x.result.numpy() for x in self.sub_pullers_]
+        local_numpy = local_blob_list[0]
+        if len(local_blob_list) > 1:
+            print("WARNING: return tensor list will concat as axis = 0.")
+            local_numpy = np.concatenate(local_blob_list, axis=0)
+        self.local_mirrored_blob_ = local_blob_util.LocalBlob(
+            local_numpy, self.mirrored_blob_.is_dynamic
+        )
+        return self.local_mirrored_blob_
+
+    def FlatConsistentBlobPullers(self):
+        for x in self.sub_pullers_:
+            yield x
+
+
+class EagerFutureRemoteBlobs(FutureRemoteBlobs):
+    def __init__(self):
+        super().__init__()
+        self.blob_getters_ = None
+
+    def get(self):
+        return self._GetResultLocalBlob(self.blob_getters_)
+
+    def async_get(self, callback):
+        assert callable(callback)
+        callback(self._GetResultLocalBlob(self.blob_getters_))
+
+    def SetResult(self, remote_blobs):
+        assert self.inited_ is False
+        assert self.blob_getters_ is None
+        self.blob_getters_ = self._MakeRemoteBlobGetters(remote_blobs)
+        return self
+
+    def _MakeRemoteBlobGetters(self, remote_blobs):
+        if isinstance(remote_blobs, (list, tuple)):
+            return type(remote_blobs)(
+                (self._MakeRemoteBlobGetters(blob) for blob in remote_blobs)
+            )
+        elif isinstance(remote_blobs, dict):
+            return {
+                k: self._MakeRemoteBlobGetters(v) for (k, v) in remote_blobs.items()
+            }
+        elif isinstance(remote_blobs, oneflow._oneflow_internal.EagerBlobTrait):
+            return _EagerBlobGetter(remote_blobs)
+        else:
+            raise NotImplementedError
+
+    def _GetResultLocalBlob(self, getter):
+        assert self.inited_
+        if isinstance(getter, _EagerBlobGetter):
+            return getter.result
+        elif isinstance(getter, (list, tuple)):
+            return type(getter)((self._GetResultLocalBlob(g) for g in getter))
+        elif isinstance(getter, dict):
+            return {k: self._GetResultLocalBlob(v) for (k, v) in getter.items()}
+        else:
+            raise NotImplementedError(type(getter))
+
+
+class _EagerBlobGetter(object):
+    def __init__(self, eager_blob):
+        assert isinstance(eager_blob, oneflow._oneflow_internal.EagerBlobTrait)
+        self.eager_blob_ = eager_blob
+        self.local_tensor_ = None
+
+    @property
+    def result(self):
+        if self.local_tensor_ is not None:
+            return self.local_tensor_
+        self.local_tensor_ = local_blob_util.MakeLocalBlob4EagerBlob(self.eager_blob_)
+        return self.local_tensor_
diff --git a/python/oneflow/framework/push_util.py b/python/oneflow/framework/push_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..a66477400d3c1b8f84f3ba1db796c97a3ddd5420
--- /dev/null
+++ b/python/oneflow/framework/push_util.py
@@ -0,0 +1,284 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from functools import reduce
+
+import numpy
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.boxing_util as boxing_util
+import oneflow.framework.balanced_splitter as balanced_splitter
+import oneflow.framework.dtype as dtype_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.input_blob_def as input_blob_def
+import oneflow.framework.python_callback as python_callback
+import oneflow.framework.remote_blob as remote_blob_util
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def AsyncPush(session, job_func, *arg):
+    assert len(arg) == len(job_func.__oneflow_input_blob_defs__)
+    for i in range(len(arg)):
+        _AsyncPushArg(session, job_func.__oneflow_input_blob_defs__[i], arg[i])
+
+
+def _AsyncPushArg(session, arg_blob_def, arg_ndarray):
+    if isinstance(arg_blob_def, (list, tuple)):
+        assert isinstance(arg_ndarray, (list, tuple)), "type(arg_ndarray): %s" % type(
+            arg_ndarray
+        )
+        assert len(arg_blob_def) == len(arg_ndarray), "%s v.s. %s" % (
+            len(arg_blob_def),
+            len(arg_ndarray),
+        )
+        for (blob_def, ndarray) in zip(arg_blob_def, arg_ndarray):
+            _AsyncPushArg(session, blob_def, ndarray)
+    elif isinstance(arg_blob_def, dict):
+        assert type(arg_blob_def) is type(arg_ndarray)
+        assert set(arg_blob_def.keys()) == set(arg_ndarray.keys())
+        for (k, blob_def) in arg_blob_def.items():
+            _AsyncPushArg(session, blob_def, arg_ndarray[k])
+    else:
+        assert isinstance(arg_blob_def, input_blob_def.ArgBlobDef)
+        arg_blob_def.CheckAndAsyncPush(session, arg_ndarray)
+
+
+def MakeEagerInputBlobs(arg_blob_def, arg_ndarray):
+    if isinstance(arg_blob_def, (list, tuple)):
+        assert isinstance(arg_ndarray, (list, tuple)), "type(arg_ndarray): %s" % type(
+            arg_ndarray
+        )
+        assert len(arg_blob_def) == len(arg_ndarray)
+        return type(arg_blob_def)(
+            (
+                MakeEagerInputBlobs(blob_def, ndarray)
+                for (blob_def, ndarray) in zip(arg_blob_def, arg_ndarray)
+            )
+        )
+    elif isinstance(arg_blob_def, dict):
+        assert type(arg_blob_def) is type(arg_ndarray)
+        assert set(arg_blob_def.keys()) == set(arg_ndarray.keys())
+        return {
+            k: MakeEagerInputBlobs(blob_def, arg_ndarray[k])
+            for (k, blob_def) in arg_blob_def.items()
+        }
+    else:
+        return _CreateEagerInputBlobAndFeedValue(arg_blob_def, arg_ndarray)
+
+
+def _CheckInputArgBlobDefValueMatch(arg_blob_def, arg_value):
+    if isinstance(arg_blob_def, input_blob_def.FixedTensorDef):
+        assert isinstance(arg_value, numpy.ndarray)
+        assert arg_blob_def.shape == arg_value.shape
+    elif isinstance(arg_blob_def, input_blob_def.MirroredTensorDef):
+        assert isinstance(arg_value, (list, tuple))
+        for v in arg_value:
+            assert isinstance(v, numpy.ndarray)
+            assert len(v.shape) == len(arg_blob_def.shape)
+            assert numpy.prod(v.shape) <= numpy.prod(arg_blob_def.shape)
+    else:
+        raise NotImplementedError
+
+
+def FeedValueToEagerBlob(blob_object, blob_def, ndarray):
+    physical_blob_objects = _GetPhysicalBlobObjects(blob_object, None)
+    feed_ctx = FeedContext(blob_object.op_arg_parallel_attr, ndarray)
+    for (i, physical_blob_object) in enumerate(physical_blob_objects):
+        feed_ctx.set_rank(i)
+        _FeedValueToInputPhysicalBlob(feed_ctx, blob_def, physical_blob_object)
+
+
+def _CreateEagerInputBlobAndFeedValue(arg_blob_def, arg_ndarray):
+    _CheckInputArgBlobDefValueMatch(arg_blob_def, arg_ndarray)
+    (arg_blob_object, lbi) = _MakeInputBlobObject(arg_blob_def)
+    FeedValueToEagerBlob(arg_blob_object, arg_blob_def, arg_ndarray)
+    get_blob = None
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    if isinstance(arg_blob_def, input_blob_def.FixedTensorDef):
+
+        def get_blob(lbi, blob_object, blob_register):
+            blob = oneflow._oneflow_internal.EagerConsistentBlob(
+                lbi, blob_object, blob_register
+            )
+            with oneflow.scope.consistent_view():
+                return oneflow.identity(blob)
+
+    elif isinstance(arg_blob_def, input_blob_def.MirroredTensorDef):
+        get_blob = oneflow._oneflow_internal.EagerMirroredBlob
+    else:
+        raise NotImplementedError
+    return get_blob(lbi, blob_object=arg_blob_object, blob_register=blob_register)
+
+
+def _MakeInputBlobObject(arg_blob_def):
+    (input_op_conf, lbi) = _MakeInputOpConfAndRetLbi(arg_blob_def)
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildInputInstruction(builder):
+        op_attribute = arg_blob_def.EagerAddAndInferOp(input_op_conf)
+        scope = oneflow.current_scope()
+        parallel_conf = scope.device_parallel_desc_symbol.parallel_conf
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInputInstruction)
+    return (bn_in_op2blob_object["out"], lbi)
+
+
+def _GetPhysicalBlobObjects(logical_blob_object, lbi):
+    blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+    physical_blob_objects = None
+
+    def BuildLogical2PhysicalInstruction(builder):
+        nonlocal physical_blob_objects
+        physical_blob_objects = builder.UnpackLogicalBlobToPhysicalBlobs(
+            logical_blob_object
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildLogical2PhysicalInstruction)
+    return physical_blob_objects
+
+
+def _MakeInputOpConfAndRetLbi(arg_blob_def):
+    assert isinstance(arg_blob_def, input_blob_def.ArgBlobDef)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = id_util.UniqueStr("Input_")
+    op_conf.input_conf.out = "out"
+    op_conf.input_conf.blob_conf.CopyFrom(arg_blob_def.ToInterfaceBlobConf())
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.input_conf.out
+    return (op_conf, lbi)
+
+
+class FeedContext(object):
+    def __init__(self, op_arg_parallel_attr, arg_ndarray, rank=0):
+        self.op_arg_parallel_attr_ = op_arg_parallel_attr
+        self.arg_ndarray_ = arg_ndarray
+        self.rank_ = rank
+        self.balanced_range_ = None
+
+    def set_rank(self, rank):
+        self.rank_ = rank
+
+    def GetFixedTensor(self, logical_shape):
+        assert isinstance(self.arg_ndarray_, numpy.ndarray)
+        assert self.arg_ndarray_.shape == logical_shape, "%s v.s. %s" % (
+            self.arg_ndarray_.shape,
+            logical_shape,
+        )
+        sbp_parallel = self.op_arg_parallel_attr_.sbp_parallel
+        parallel_num = self.op_arg_parallel_attr_.parallel_desc_symbol.parallel_num
+        if sbp_parallel.has_broadcast_parallel() or parallel_num == 1:
+            return self._AsContiguousNdArray(self.arg_ndarray_)
+        elif sbp_parallel.has_split_parallel():
+            axis = sbp_parallel.split_parallel().axis()
+            (start, end) = self._GetBalancedRanges(logical_shape[axis])[self.rank_]
+            slc = [slice(None)] * len(logical_shape)
+            slc[axis] = slice(start, end)
+            ndarray = self.arg_ndarray_[tuple(slc)]
+            return self._AsContiguousNdArray(ndarray)
+        else:
+            raise NotImplementedError
+
+    def _GetBalancedRanges(self, dim):
+        parallel_num = self.op_arg_parallel_attr_.parallel_desc_symbol.parallel_num
+        if self.balanced_range_ is None:
+            self.balanced_range_ = balanced_splitter.BalancedRanges(dim, parallel_num)
+        return self.balanced_range_
+
+    def GetMirroredTensor(self, static_shape):
+        capacity = reduce(lambda x, y: x * y, static_shape, 1)
+        assert isinstance(self.arg_ndarray_, (list, tuple))
+        parallel_num = self.op_arg_parallel_attr_.parallel_desc_symbol.parallel_num
+        assert len(self.arg_ndarray_) == parallel_num
+        assert all((isinstance(a, numpy.ndarray) for a in self.arg_ndarray_))
+        assert self.rank_ >= 0
+        assert self.rank_ < parallel_num
+        ndarray = self.arg_ndarray_[self.rank_]
+        elem_cnt = reduce(lambda x, y: x * y, ndarray.shape, 1)
+        assert elem_cnt <= capacity, "%s v.s. %s" % (ndarray.shape, static_shape)
+        return self._AsContiguousNdArray(ndarray)
+
+    def _AsContiguousNdArray(self, ndarray):
+        if isinstance(ndarray, numpy.ndarray):
+            return (
+                ndarray
+                if ndarray.flags["C_CONTIGUOUS"]
+                else numpy.ascontiguousarray(ndarray)
+            )
+        elif isinstance(ndarray, (tuple, list)):
+            return type(ndarray)((self._AsContiguousNdArray(a) for a in ndarray))
+        else:
+            raise NotImplementedError
+
+
+def _FeedValueToInputPhysicalBlob(feed_ctx, blob_def, blob_object):
+    assert isinstance(blob_def, input_blob_def.ArgBlobDef)
+    assert isinstance(blob_object, oneflow._oneflow_internal.BlobObject)
+    FeedBlob = _MakeFeedBlobCallback(feed_ctx, blob_def, blob_object)
+    assert callable(FeedBlob)
+
+    def BuildFeedInstruction(builder):
+        builder.FeedBlob(
+            blob_object, python_callback.GetIdForRegisteredCallback(FeedBlob)
+        )
+        builder.InsertRemoveForeignCallbackInstruction(
+            blob_object.object_id, python_callback.GetIdForRegisteredCallback(FeedBlob)
+        )
+
+    oneflow._oneflow_internal.deprecated.PhysicalRun(BuildFeedInstruction)
+
+
+def _MakeFeedBlobCallback(feed_ctx, blob_def, blob_object):
+    if isinstance(blob_def, input_blob_def.FixedTensorDef):
+
+        def FeedBlob(ofblob):
+            ndarray = feed_ctx.GetFixedTensor(blob_def.shape)
+            dtype = dtype_util.convert_oneflow_dtype_to_numpy_dtype(ofblob.dtype)
+            assert ndarray.dtype == dtype, "%s v.s. %s" % (ndarray.dtype, dtype)
+            assert ndarray.shape == ofblob.static_shape, "%s v.s. %s" % (
+                ndarray.shape,
+                ofblob.static_shape,
+            )
+            if ofblob.CopyFromNdarray(ndarray) is False:
+                raise ValueError
+
+    elif isinstance(blob_def, input_blob_def.MirroredTensorDef):
+
+        def FeedBlob(ofblob):
+            ndarray = feed_ctx.GetMirroredTensor(ofblob.static_shape)
+            assert isinstance(ndarray, numpy.ndarray)
+            dtype = dtype_util.convert_oneflow_dtype_to_numpy_dtype(ofblob.dtype)
+            assert ndarray.dtype == dtype, "%s v.s. %s" % (ndarray.dtype, dtype)
+            if ofblob.CopyFromNdarray(ndarray) is False:
+                raise ValueError
+
+    else:
+        raise NotImplementedError
+    return FeedBlob
diff --git a/python/oneflow/framework/python_callback.py b/python/oneflow/framework/python_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..43d4b1098526abf9ff11e9fc816b8220fcb77960
--- /dev/null
+++ b/python/oneflow/framework/python_callback.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.job_conf as job_conf_cfg
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow._oneflow_internal.oneflow.core.job.scope as scope_cfg
+import oneflow._oneflow_internal.oneflow.core.operator.op_attribute as op_attribute_cfg
+import oneflow.framework.ofblob as ofblob
+
+
+def GetIdForRegisteredCallback(cb):
+    assert callable(cb)
+    global unique_id2handler
+    unique_id2handler[id(cb)] = cb
+    return id(cb)
+
+
+def DeleteRegisteredCallback(cb):
+    global unique_id2handler
+    assert id(cb) in unique_id2handler
+    del unique_id2handler[id(cb)]
+
+
+class PythonCallback(oneflow._oneflow_internal.ForeignCallback):
+    def __init__(self):
+        oneflow._oneflow_internal.ForeignCallback.__init__(self)
+
+    def OfBlobCall(self, unique_id, of_blob_ptr):
+        try:
+            _WatcherHandler(unique_id, of_blob_ptr)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def RemoveForeignCallback(self, unique_id):
+        global unique_id2handler
+        try:
+            del unique_id2handler[unique_id]
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def EagerInterpretCompletedOp(self, op_attribute, parallel_conf):
+        try:
+            interpreter_callback.InterpretCompletedOp(str(op_attribute), parallel_conf)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def EagerMirroredCast(self, op_attribute, parallel_conf):
+        try:
+            interpreter_callback.MirroredCast(str(op_attribute), parallel_conf)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def MakeScopeSymbol(self, job_conf, parallel_conf, is_mirrored):
+        try:
+            return interpreter_callback.MakeScopeSymbol(
+                job_conf, parallel_conf, is_mirrored
+            )
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+    def MakeParallelDescSymbol(self, parallel_conf):
+        try:
+            return interpreter_callback.MakeParallelDescSymbol(parallel_conf)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+
+def _WatcherHandler(unique_id, of_blob_ptr):
+    global unique_id2handler
+    assert unique_id in unique_id2handler
+    handler = unique_id2handler[unique_id]
+    assert callable(handler)
+    handler(ofblob.OfBlob(of_blob_ptr))
+
+
+unique_id2handler = {}
+global_python_callback = PythonCallback()
+interpreter_callback = None
diff --git a/python/oneflow/framework/register_class_method_util.py b/python/oneflow/framework/register_class_method_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..30a64a3ec88fad5f446dd0e71621857f8461a436
--- /dev/null
+++ b/python/oneflow/framework/register_class_method_util.py
@@ -0,0 +1,37 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+import oneflow.eager.eager_blob_util as eager_blob_util
+import oneflow.framework.blob_trait as blob_trait
+import oneflow.framework.functional as functional
+import oneflow.framework.generator as generator
+import oneflow.framework.op_expr_util as op_expr_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def RegisterMethod4Class():
+    op_expr_util.RegisterMethod4UserOpExpr()
+    functional.RegisterFunctionalApis()
+    eager_blob_util.RegisterMethod4EagerPhysicalBlob()
+    blob_trait.RegisterBlobOperatorTraitMethod(
+        oneflow._oneflow_internal.EagerPhysicalBlob
+    )
+    blob_trait.RegisterBlobOperatorTraitMethod(oneflow._oneflow_internal.ConsistentBlob)
+    blob_trait.RegisterBlobOperatorTraitMethod(oneflow._oneflow_internal.MirroredBlob)
+    remote_blob_util.RegisterMethod4EagerBlobTrait()
+    remote_blob_util.RegisterMethod4LazyConsistentBlob()
+    remote_blob_util.RegisterMethod4LazyMirroredBlob()
+    remote_blob_util.RegisterMethod4EagerConsistentBlob()
diff --git a/python/oneflow/framework/register_python_callback.py b/python/oneflow/framework/register_python_callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..3080f57cb2871398c5f97ac92fb48d6ac0cb8309
--- /dev/null
+++ b/python/oneflow/framework/register_python_callback.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow._oneflow_internal
+import oneflow.eager.interpreter_callback as interpreter_callback
+import oneflow.framework.python_callback as python_callback
+
+python_callback.interpreter_callback = interpreter_callback
diff --git a/python/oneflow/framework/remote_blob.py b/python/oneflow/framework/remote_blob.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a7c28506031b998e9635f3acb2ed5794b9c5e5
--- /dev/null
+++ b/python/oneflow/framework/remote_blob.py
@@ -0,0 +1,236 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import sys
+import traceback
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.boxing_util as boxing_util
+import oneflow.eager.eager_blob_util as eager_blob_util
+import oneflow.eager.gradient_util as gradient_util
+import oneflow.framework.blob_trait as blob_trait
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.hob as hob
+import oneflow.framework.id_util as id_util
+import oneflow.framework.placement_context as placement_ctx
+import oneflow.support.enable_if as enable_if
+from oneflow.framework.dtype import convert_proto_dtype_to_oneflow_dtype
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def RemoteBlob(lbi, **kw):
+    api = enable_if.unique([EagerLogicalBlob, LazyRemoteBlob])
+    return api(lbi, **kw)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerLogicalBlob(lbi, **kw):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    lbn = lbi.op_name + "/" + lbi.blob_name
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    blob_type = oneflow._oneflow_internal.EagerConsistentBlob
+    if c_api_util.JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn):
+        blob_type = oneflow._oneflow_internal.EagerMirroredBlob
+    job_name = ""
+    if "job_name" in kw and kw["job_name"] is not None:
+        job_name = kw["job_name"]
+    blob_object = None
+    if "blob_object" in kw:
+        blob_object = kw["blob_object"]
+    distribute = oneflow._oneflow_internal.distribute.auto()
+    if "distribute" in kw:
+        distribute = kw["distribute"]
+    return blob_type(lbi, blob_object, blob_register, job_name, distribute)
+
+
+@enable_if.condition(~hob.eager_execution_enabled)
+def LazyRemoteBlob(lbi, **kw):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    lbn = lbi.op_name + "/" + lbi.blob_name
+    blob_type = oneflow._oneflow_internal.LazyConsistentBlob
+    if c_api_util.JobBuildAndInferCtx_IsMirroredBlob(job_name, lbn):
+        blob_type = oneflow._oneflow_internal.LazyMirroredBlob
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    job_name = ""
+    if "job_name" in kw and kw["job_name"] is not None:
+        job_name = kw["job_name"]
+    distribute = oneflow._oneflow_internal.distribute.auto()
+    if "distribute" in kw:
+        distribute = kw["distribute"]
+    return blob_type(lbi, job_name, distribute)
+
+
+@property
+def dtype(self):
+    ret = convert_proto_dtype_to_oneflow_dtype(self.get_dtype())
+    assert isinstance(ret, oneflow.dtype)
+    return ret
+
+
+def with_distribute(self, distribute):
+    new = type(self)(
+        self.lbi, self.job_name, oneflow._oneflow_internal.distribute.auto()
+    )
+    new.set_distribute(distribute)
+    return new
+
+
+def with_gradient_distribute(self, distribute):
+    return oneflow.parallel_cast(self, gradient_distribute=distribute)
+
+
+def get_lazy_shape_log_warning(self):
+    if oneflow.scope.mirrored_view_enabled():
+        return "%s\n%s\n%s" % (
+            "WARNING:",
+            "You access a consistent blob shape in mirrored view, there may be problems,",
+            "you should add 'x = flow.cast_to_current_logical_view(x)'.",
+        )
+    else:
+        return ""
+
+
+def get_mirror_shape_log_warning(self):
+    if oneflow.scope.consistent_view_enabled():
+        return "%s\n%s\n%s" % (
+            "WARNING:",
+            "You access a mirrored blob shape in consistent view, there may be problems,",
+            "you should add 'x = flow.cast_to_current_logical_view(x)'.",
+        )
+    else:
+        return ""
+
+
+def RegisterMethod4BlobDef(blob_class):
+    blob_class.dtype = dtype
+    blob_class.with_distribute = with_distribute
+    blob_class.with_gradient_distribute = with_gradient_distribute
+
+
+def RegisterMethod4LazyConsistentBlob():
+    RegisterMethod4BlobDef(oneflow._oneflow_internal.LazyConsistentBlob)
+    oneflow._oneflow_internal.LazyConsistentBlob.get_lazy_shape_log_warning = (
+        get_lazy_shape_log_warning
+    )
+
+
+def RegisterMethod4LazyMirroredBlob():
+    RegisterMethod4BlobDef(oneflow._oneflow_internal.LazyMirroredBlob)
+    oneflow._oneflow_internal.LazyMirroredBlob.get_mirror_shape_log_warning = (
+        get_mirror_shape_log_warning
+    )
+
+
+@property
+def sub_consistent_blob_list(self):
+    raise NotImplementedError
+
+
+def numpy(self, rank=None):
+    assert rank is None or rank == 0
+    return self._Numpy()
+
+
+def numpy_list(self, rank=None):
+    assert rank is None or rank == 0
+    return [self._Numpy()]
+
+
+def BlobObjectNumpy(blob_object, tmp_name=None):
+    if tmp_name is None:
+        tmp_name = id_util.UniqueStr("numpy-tmp-")
+
+    def FetchBlobNumpy(blob_object):
+        consistent_blob_name = None
+
+        def BoxingToSingleDevice(builder):
+            parallel_conf = placement_cfg.ParallelConf()
+            parallel_conf.set_device_tag(blob_object.parallel_desc_symbol.device_tag)
+            parallel_conf.add_device_name("{}:{}".format(0, 0))
+            tmp_parallel_desc_symbol = builder.GetParallelDescSymbol(parallel_conf)
+            tmp_op_arg_parallel_attr = oneflow._oneflow_internal.OpArgParallelAttribute(
+                tmp_parallel_desc_symbol,
+                str(blob_object.op_arg_parallel_attr.sbp_parallel),
+                str(blob_object.op_arg_parallel_attr.opt_mirrored_parallel),
+            )
+            with oneflow.scope.placement(
+                parallel_conf.device_tag(), list(parallel_conf.device_name())
+            ):
+                tmp_blob_object = boxing_util.BoxingTo(
+                    builder, blob_object, tmp_op_arg_parallel_attr
+                )
+            nonlocal consistent_blob_name
+            consistent_blob_name = tmp_name
+            if not blob_register.HasObject4BlobName(consistent_blob_name):
+                blob_register.SetObject4BlobName(consistent_blob_name, tmp_blob_object)
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(BoxingToSingleDevice)
+        return oneflow._oneflow_internal.EagerPhysicalBlob(
+            consistent_blob_name,
+            blob_register,
+            eager_blob_util._GetPhysicalBlobHeaderCache,
+        ).numpy()
+
+    return FetchBlobNumpy(blob_object)
+
+
+def _Numpy(self):
+    tmp_name = "{}-consistent".format(self.logical_blob_name)
+    return BlobObjectNumpy(self.blob_object, tmp_name)
+
+
+def RegisterMethod4EagerBlobTrait():
+    oneflow._oneflow_internal.EagerBlobTrait.sub_consistent_blob_list = (
+        sub_consistent_blob_list
+    )
+    oneflow._oneflow_internal.EagerBlobTrait.dtype = dtype
+    oneflow._oneflow_internal.EagerBlobTrait._Numpy = _Numpy
+    oneflow._oneflow_internal.EagerBlobTrait.numpy = numpy
+    oneflow._oneflow_internal.EagerBlobTrait.numpy_list = numpy_list
+
+
+def eager_with_distribute(self, distribute):
+    new = type(self)(
+        self.lbi,
+        blob_object=self.blob_object,
+        blob_register=blob_register,
+        job_name=self.job_name,
+        distribute=self.distribute,
+    )
+    new.set_distribute(distribute)
+    return new
+
+
+def RegisterMethod4EagerConsistentBlob():
+    oneflow._oneflow_internal.EagerConsistentBlob.dtype = dtype
+    oneflow._oneflow_internal.EagerConsistentBlob.with_distribute = (
+        eager_with_distribute
+    )
+    oneflow._oneflow_internal.EagerConsistentBlob.with_gradient_distribute = (
+        with_gradient_distribute
+    )
diff --git a/python/oneflow/framework/runtime_mode.py b/python/oneflow/framework/runtime_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e063b035e2201dc89e4a9e7e03f19b8d267937a
--- /dev/null
+++ b/python/oneflow/framework/runtime_mode.py
@@ -0,0 +1,41 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+NORMAL_MODE = "NORMAL_MODE"
+GLOBAL_MODE = "GLOBAL_MODE"
+DEVICE_MODE = "DEVICE_MODE"
+
+
+def CurrentMode():
+    return mode_statck[0]
+
+
+def IsValidMode(mode):
+    return mode == NORMAL_MODE or mode == GLOBAL_MODE or mode == DEVICE_MODE
+
+
+@contextmanager
+def ModeScope(mode):
+    global mode_statck
+    mode_statck.insert(0, mode)
+    try:
+        yield
+    finally:
+        mode_statck.pop(0)
+
+
+mode_statck = [NORMAL_MODE]
diff --git a/python/oneflow/framework/scope_symbol.py b/python/oneflow/framework/scope_symbol.py
new file mode 100644
index 0000000000000000000000000000000000000000..43f8c866c46101dbcb445c00c70d066e66637a23
--- /dev/null
+++ b/python/oneflow/framework/scope_symbol.py
@@ -0,0 +1,160 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import re
+
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow._oneflow_internal.oneflow.core.job.scope as scope_cfg
+import oneflow.eager.symbol_storage as symbol_storage
+from oneflow.eager.symbol import Symbol
+
+
+class ScopeSymbol(Symbol):
+    def __init__(self, symbol_id, scope_proto, parent_scope_symbol=None):
+        Symbol.__init__(self, symbol_id, scope_proto)
+        self.parent_scope_symbol_ = parent_scope_symbol
+        self.job_desc_symbol_ = oneflow._oneflow_internal.GetJobConfSymbol(
+            scope_proto.job_desc_symbol_id()
+        )
+        self.device_parallel_desc_symbol_ = oneflow._oneflow_internal.GetPlacementSymbol(
+            scope_proto.device_parallel_desc_symbol_id()
+        )
+        self.host_parallel_desc_symbol_ = oneflow._oneflow_internal.GetPlacementSymbol(
+            scope_proto.host_parallel_desc_symbol_id()
+        )
+        self.auto_increment_id_ = 0
+
+    def auto_increment_id(self):
+        self.auto_increment_id_ = self.auto_increment_id_ + 1
+        return self.auto_increment_id_
+
+    @property
+    def session_id(self):
+        return self.data.session_id()
+
+    @property
+    def job_desc_symbol(self):
+        return self.job_desc_symbol_
+
+    @property
+    def device_parallel_desc_symbol(self):
+        return self.device_parallel_desc_symbol_
+
+    @property
+    def parent_scope_symbol(self):
+        return self.parent_scope_symbol_
+
+    def BuildBySetter(self, instruction_builder, setter):
+        scope_proto = self._CloneScopeProto()
+        setter(scope_proto)
+        return instruction_builder.GetScopeSymbol(scope_proto)
+
+    def BuildWithNewParallelDesc(
+        self, instruction_builder, device_tag, machine_device_ids
+    ):
+        if isinstance(machine_device_ids, str):
+            machine_device_ids = [machine_device_ids]
+
+        def SetScopeProto(scope_proto):
+            parallel_conf = MakeParallelConf(device_tag, machine_device_ids)
+            device_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(
+                parallel_conf
+            )
+            parallel_conf = MakeParallelConf("cpu", machine_device_ids)
+            host_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(
+                parallel_conf
+            )
+            scope_proto.set_device_parallel_desc_symbol_id(
+                device_parallel_desc_sym.symbol_id
+            )
+            scope_proto.set_host_parallel_desc_symbol_id(
+                host_parallel_desc_sym.symbol_id
+            )
+
+        return self.BuildBySetter(instruction_builder, SetScopeProto)
+
+    def BuildWithNewParallelConf(self, instruction_builder, parallel_conf):
+        (
+            device_tag,
+            machine_device_ids,
+            hierarchy,
+        ) = oneflow._oneflow_internal.GetDeviceTagAndMachineDeviceIdsAndHierarchy(
+            parallel_conf
+        )
+        return self.BuildWithNewParallelDesc(
+            instruction_builder, device_tag, machine_device_ids
+        )
+
+    def BuildWithNewIsMirrored(self, instruction_builder, is_mirrored):
+        def SetScopeProto(scope_proto):
+            if is_mirrored:
+                scope_proto.mutable_opt_mirrored_parallel_conf().mutable_mirrored_parallel()
+            else:
+                scope_proto.mutable_opt_mirrored_parallel_conf().clear_mirrored_parallel()
+
+        return self.BuildBySetter(instruction_builder, SetScopeProto)
+
+    def BuildWithNewScopeName(self, instruction_builder, scope_name):
+        def SetScopeProto(scope_proto):
+            scope_proto.add_scope_op_name_prefixes(scope_name)
+
+        return self.BuildBySetter(instruction_builder, SetScopeProto)
+
+    def _CloneScopeProto(self):
+        scope_proto = scope_cfg.ScopeProto()
+        scope_proto.CopyFrom(self.data)
+        return scope_proto
+
+
+def BuildInitialScope(
+    instruction_builder,
+    session_id,
+    job_conf,
+    device_tag,
+    machine_device_ids,
+    is_mirrored,
+):
+    scope_proto = scope_cfg.ScopeProto()
+    scope_proto.set_session_id(session_id)
+    job_conf_sym = instruction_builder.GetJobConfSymbol(job_conf)
+    scope_proto.set_job_desc_symbol_id(job_conf_sym.symbol_id)
+    parallel_conf = MakeParallelConf(device_tag, machine_device_ids)
+    device_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(parallel_conf)
+    scope_proto.set_device_parallel_desc_symbol_id(device_parallel_desc_sym.symbol_id)
+    parallel_conf = MakeParallelConf("cpu", machine_device_ids)
+    host_parallel_desc_sym = instruction_builder.GetParallelDescSymbol(parallel_conf)
+    scope_proto.set_host_parallel_desc_symbol_id(host_parallel_desc_sym.symbol_id)
+    if is_mirrored:
+        scope_proto.mutable_opt_mirrored_parallel_conf().mutable_mirrored_parallel()
+    else:
+        scope_proto.mutable_opt_mirrored_parallel_conf().clear_mirrored_parallel()
+    return instruction_builder.GetScopeSymbol(scope_proto)
+
+
+def MakeParallelConf(device_tag, machine_device_ids):
+    assert isinstance(machine_device_ids, (list, tuple))
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.set_device_tag(device_tag)
+    for machine_device_id in machine_device_ids:
+        assert isinstance(
+            machine_device_id, str
+        ), "type of machine_device_id (%s) is not string" % type(machine_device_id)
+        assert re.match("^\\d+:\\d+(-\\d+)?$", machine_device_id) is not None, (
+            "machine_device_id: %s is not valid" % machine_device_id
+        )
+        parallel_conf.add_device_name(machine_device_id)
+    return parallel_conf
diff --git a/python/oneflow/framework/scope_util.py b/python/oneflow/framework/scope_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..82725f77dc23b6467df3b1bac11fb3271363054f
--- /dev/null
+++ b/python/oneflow/framework/scope_util.py
@@ -0,0 +1,117 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+from contextlib import contextmanager
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.job_conf as job_conf_cfg
+import oneflow.core.job.scope_pb2 as scope_pb2_util
+import oneflow.framework.attr_util as attr_util
+import oneflow.framework.session_context as session_ctx
+from oneflow import oneflow_deprecate
+
+
+def api_scope_config(**kwargs):
+    name2default = session_ctx.GetDefaultSession().scope_attr_name2default_val
+
+    def SetScopeProto(scope_proto):
+        for (attr_name, py_value) in kwargs.items():
+            assert attr_name in name2default
+            attr_util.SetAttrValue(
+                scope_proto.mutable_attr_name2attr_value()[attr_name],
+                py_value,
+                name2default[attr_name],
+            )
+
+    sess = session_ctx.GetDefaultSession()
+    scope = MakeScope(
+        lambda old_scope, builder: builder.BuildScopeByProtoSetter(
+            old_scope, SetScopeProto
+        )
+    )
+    return ScopeContext(scope)
+
+
+def api_current_scope():
+    """ Return current scope
+    """
+    return oneflow._oneflow_internal.GetCurrentScope()
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+def deprecated_current_scope(*args, **kwargs):
+    print(
+        "WARNING:",
+        "oneflow.scope.current_scope",
+        "will be removed in the future, use {} instead.".format(
+            "oneflow.current_scope"
+        ),
+    )
+    print(traceback.format_stack()[-2])
+    return api_current_scope(*args, **kwargs)
+
+
+def MakeScope(build_func):
+    scope = None
+    old_scope = oneflow._oneflow_internal.GetCurrentScope()
+    assert old_scope is not None
+
+    def BuildScope(builder):
+        nonlocal scope
+        scope = build_func(old_scope, builder)
+        assert scope is not None
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildScope)
+    return scope
+
+
+def MakeInitialScope(job_conf, device_tag, machine_device_ids, hierarchy, is_mirrored):
+    scope = None
+
+    def BuildInitialScope(builder):
+        nonlocal scope
+        session_id = session_ctx.GetDefaultSession().id
+        scope = builder.BuildInitialScope(
+            session_id, job_conf, device_tag, machine_device_ids, hierarchy, is_mirrored
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInitialScope)
+    return scope
+
+
+def InitScopeStack():
+    job_conf = job_conf_cfg.JobConfigProto()
+    job_conf.mutable_predict_conf()
+    job_conf.set_job_name("")
+    scope = MakeInitialScope(job_conf, "cpu", ["0:0"], None, is_mirrored=False)
+    oneflow._oneflow_internal.InitGlobalScopeStack(scope)
+
+
+@contextmanager
+def ScopeContext(scope):
+    old_scope = oneflow._oneflow_internal.GetCurrentScope()
+    oneflow._oneflow_internal.GlobalScopeStackPush(scope)
+    try:
+        yield
+    finally:
+        assert oneflow._oneflow_internal.GetCurrentScope() is scope
+        oneflow._oneflow_internal.GlobalScopeStackPop()
+        assert oneflow._oneflow_internal.GetCurrentScope() is old_scope
diff --git a/python/oneflow/framework/session_context.py b/python/oneflow/framework/session_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..66198047abea28ec051c724d1cbe7e5aa95286b2
--- /dev/null
+++ b/python/oneflow/framework/session_context.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+
+import oneflow
+import oneflow._oneflow_internal
+
+
+class SessionStatus:
+    OPEN = "OPEN"
+    RUNNING = "RUNNING"
+    CLOSED = "CLOSED"
+
+
+def GetDefaultSession():
+    global _sess_id2sess
+    default_sess_id = oneflow._oneflow_internal.GetDefaultSessionId()
+    assert default_sess_id in _sess_id2sess
+    return _sess_id2sess[default_sess_id]
+
+
+def OpenDefaultSession(sess):
+    global _sess_id2sess
+    assert sess.id not in _sess_id2sess
+    _sess_id2sess[sess.id] = sess
+
+
+def TryCloseDefaultSession():
+    global _sess_id2sess
+    default_sess_id = oneflow._oneflow_internal.GetDefaultSessionId()
+    assert default_sess_id in _sess_id2sess
+    if default_sess_id in _sess_id2sess:
+        _sess_id2sess[default_sess_id].TryClose()
+    del _sess_id2sess[default_sess_id]
+
+
+def try_init_default_session(func):
+    @functools.wraps(func)
+    def Func(*args, **kwargs):
+        GetDefaultSession().TryInit()
+        return func(*args, **kwargs)
+
+    return Func
+
+
+_sess_id2sess = {}
diff --git a/python/oneflow/framework/session_util.py b/python/oneflow/framework/session_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e75886644a68ae182678dec04b1da5239d70fb84
--- /dev/null
+++ b/python/oneflow/framework/session_util.py
@@ -0,0 +1,507 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import threading
+import traceback
+from contextlib import contextmanager
+from typing import Callable
+
+from google.protobuf import text_format
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.core.job.job_set_pb2 as job_set_util
+import oneflow.eager.op_executor as op_executor
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.check_point_v2 as check_point_v2
+import oneflow.framework.compiler as compiler
+import oneflow.framework.config_util as config_util
+import oneflow.framework.env_util as env_util
+import oneflow.framework.hob as hob
+import oneflow.framework.job_instance as job_instance_util
+import oneflow.framework.module as module_util
+import oneflow.framework.push_util as push_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.framework.typing_util as oft_util
+import oneflow.support.enable_if as enable_if
+from oneflow import oneflow_deprecate
+from oneflow.core.job.job_set_pb2 import ConfigProto
+from oneflow.experimental import interface_op_read_and_write
+from oneflow.framework.check_point import SnapshotManager
+from oneflow.framework.function_desc import FunctionDesc
+from oneflow.framework.pull_util import EagerFutureRemoteBlobs, LazyFutureRemoteBlobs
+from oneflow.framework.session_context import SessionStatus
+
+
+class Session(object):
+    def __init__(self, sess_id):
+        self.job_name2function_desc_ = {}
+        self.job_name2job_ = {}
+        self.status_ = SessionStatus.OPEN
+        self.cond_var_ = threading.Condition()
+        self.running_job_cnt_ = 0
+        self.inter_user_job_info_ = None
+        self.uuid2watch_handler_ = {}
+        self.config_proto_ = None
+        self.resource_ = None
+        self.job_name2var_name2var_blob_ = {}
+        self.job_name2module_name2module_ = {}
+        self.existed_module_names_ = set()
+        self.var_name2var_blob_ = {}
+        self.interface_op_name2op_attr_ = {}
+        self.interface_op_name2job_name_ = {}
+        self.lazy_interface_op_name2parallel_conf_ = {}
+        self.op_name2lazy_blob_cache_ = {}
+        self.job_name2name_scope_stack_ = {}
+        self.eager_global_function_desc_stack_ = []
+        self.function_flag_name2default_val_ = {}
+        self._UpdateFunctionFlagName2DefaultVal()
+        self.scope_attr_name2default_val_ = {}
+        self._UpdateScopeAttrName2DefaultVal()
+        self.sess_ = oneflow._oneflow_internal.RegsiterSession(sess_id)
+        self.backward_blob_register_ = oneflow._oneflow_internal.BlobRegister()
+        self.snapshot_mgr_ = SnapshotManager()
+        self.eager_config_proto_ctx_ = None
+
+    @property
+    def id(self):
+        return self.sess_.id
+
+    @property
+    def status(self):
+        return self.status_
+
+    @property
+    def is_running(self):
+        return self.status_ is SessionStatus.RUNNING
+
+    @property
+    def config_proto(self):
+        if self.config_proto_ is None:
+            self.config_proto_ = _GetDefaultConfigProto()
+        return self.config_proto_
+
+    @property
+    def resource(self):
+        if self.resource_ is None:
+            return oneflow.env.current_resource()
+        else:
+            return self.resource_
+
+    @property
+    def uuid2watch_handler(self):
+        return self.uuid2watch_handler_
+
+    @property
+    def function_flag_name2default_val(self):
+        return self.function_flag_name2default_val_
+
+    @property
+    def scope_attr_name2default_val(self):
+        return self.scope_attr_name2default_val_
+
+    @property
+    def inter_user_job_info(self):
+        return self.inter_user_job_info_
+
+    @property
+    def job_name2name_scope_stack(self):
+        return self.job_name2name_scope_stack_
+
+    @property
+    def backward_blob_register(self):
+        return self.backward_blob_register_
+
+    @property
+    def snapshot_mgr(self):
+        return self.snapshot_mgr_
+
+    @property
+    def var_name2var_blob(self):
+        return self.var_name2var_blob_
+
+    def GetLazyFunctionDesc(self, job_name):
+        if job_name in self.job_name2function_desc_:
+            return self.job_name2function_desc_[job_name]
+        return None
+
+    def AnyGlobalFunctionDefined(self):
+        return len(self.job_name2function_desc_) > 0
+
+    def GetJobConfigProto(self, job_name):
+        return self.job_name2function_desc_[job_name].job_config_proto
+
+    def GetFunctionDesc(self, job_name):
+        return self.job_name2function_desc_[job_name]
+
+    def _UpdateFunctionFlagName2DefaultVal(self):
+        items = c_api_util.GetFunctionConfigDef().attr_name2attr_def.items()
+        self.function_flag_name2default_val_ = {k: v.default_val for (k, v) in items}
+
+    def _UpdateScopeAttrName2DefaultVal(self):
+        items = c_api_util.GetScopeConfigDef().attr_name2attr_def.items()
+        self.scope_attr_name2default_val_ = {k: v.default_val for (k, v) in items}
+
+    def TryInit(self):
+        if self.status_ is SessionStatus.OPEN:
+            self.Init()
+        return self
+
+    def UpdateInfo4InterfaceOp(self):
+        for op_attr in c_api_util.GetInterfaceOpAttributes().op_attribute:
+            self.interface_op_name2op_attr_[op_attr.op_conf.name] = op_attr
+        for job in c_api_util.GetJobSet().job:
+            op_name2parallel_conf = {}
+            for placement_group in job.placement.placement_group:
+                for op_name in placement_group.op_set.op_name:
+                    op_name2parallel_conf[op_name] = placement_group.parallel_conf
+            for op_conf in job.net.op:
+                if c_api_util.IsInterfaceOpConf(op_conf):
+                    self.interface_op_name2job_name_[
+                        op_conf.name
+                    ] = job.job_conf.job_name
+                    self.lazy_interface_op_name2parallel_conf_[
+                        op_conf.name
+                    ] = op_name2parallel_conf[op_conf.name]
+
+    def Init(self):
+        assert self.status_ is SessionStatus.OPEN
+        self.status_ = SessionStatus.RUNNING
+        if not oneflow._oneflow_internal.IsEnvInited():
+            oneflow.env.init()
+        _TryCompleteConfigProto(self.config_proto)
+        self.resource_ = self.config_proto.resource
+        if not oneflow._oneflow_internal.EagerExecutionEnabled():
+            c_api_util.InitLazyGlobalSession(self.config_proto)
+            for (job_name, func_desc) in self.job_name2function_desc_.items():
+                compiler.Compile(self, func_desc, self.config_proto)
+                self.existed_module_names_ = set()
+            self.job_name2var_name2var_blob_ = dict()
+            assert len(self.job_name2function_desc_.items()) > 0
+            oneflow._oneflow_internal.StartLazyGlobalSession()
+            self.inter_user_job_info_ = c_api_util.GetInterUserJobInfo()
+            self.UpdateInfo4InterfaceOp()
+            if not config_util.api_legacy_model_io_enabled():
+                check_point_v2.Init()
+        else:
+            self.eager_config_proto_ctx_ = oneflow._oneflow_internal.LogicalConfigProtoContext(
+                str(self.config_proto)
+            )
+        return self
+
+    def FindOrCreateLazyBlob(self, op_name, Create):
+        if op_name not in self.op_name2lazy_blob_cache_:
+            self.op_name2lazy_blob_cache_[op_name] = Create()
+        return self.op_name2lazy_blob_cache_[op_name]
+
+    def TryClose(self):
+        if self.status_ is SessionStatus.RUNNING:
+            self.Close()
+        if self.status_ != SessionStatus.CLOSED:
+            oneflow._oneflow_internal.ClearSessionById(self.id)
+        self.status_ = SessionStatus.CLOSED
+
+    def Close(self):
+        assert self.status_ is SessionStatus.RUNNING
+        self.Sync()
+        assert len(self.job_name2var_name2var_blob_) == 0
+        del self.var_name2var_blob_
+        del self.job_name2module_name2module_
+        self.ReleaseLazyRefBlob()
+        self.ForceReleaseEagerBlobs()
+        oneflow._oneflow_internal.StopLazyGlobalSession()
+        oneflow._oneflow_internal.DestroyLazyGlobalSession()
+        self.resource_ = None
+        if self.eager_config_proto_ctx_:
+            del self.eager_config_proto_ctx_
+
+    def AddJob(self, function_desc):
+        assert self.status_ is SessionStatus.OPEN
+        assert isinstance(function_desc, FunctionDesc)
+        self.job_name2function_desc_[function_desc.job_func.__name__] = function_desc
+
+    def StashJob(self, job_name=None, key=None):
+        assert self.status_ is SessionStatus.RUNNING, "current status {}".format(
+            self.status_
+        )
+        job = c_api_util.GetCurrentJob()
+        if job_name is not None:
+            assert (
+                job.job_conf.job_name == job_name
+            ), "{} is not current job name".format(job_name)
+        else:
+            job_name = job.job_conf.job_name
+        if key is None:
+            key = job_name
+        self.job_name2job_[key] = job
+
+    def Job(self, job_name):
+        assert self.status_ is SessionStatus.RUNNING
+        if job_name not in self.job_name2job_:
+            return None
+        return self.job_name2job_[job_name]
+
+    def Sync(self):
+        assert self.status_ is SessionStatus.RUNNING
+        self.cond_var_.acquire()
+        while self.running_job_cnt_ > 0:
+            self.cond_var_.wait()
+        assert self.running_job_cnt_ == 0
+        self.cond_var_.release()
+
+    def ReleaseLazyRefBlob(self):
+        self.op_name2lazy_blob_cache_.clear()
+
+    def ForceReleaseEagerBlobs(self):
+        oneflow._oneflow_internal.GetDefaultBlobRegister().ForceReleaseAll()
+        self.backward_blob_register_.ForceReleaseAll()
+
+    def LazyRun(self, job_func, *arg):
+        assert self.status_ is SessionStatus.RUNNING
+        remote_blobs = self.LaunchUserJob(job_func, *arg)
+        if remote_blobs is None:
+            return
+        future_blob = LazyFutureRemoteBlobs(self).SetResult(remote_blobs).Inited()
+        annotation = inspect.signature(job_func).return_annotation
+        return oft_util.TransformGlobalFunctionResult(future_blob, annotation)
+
+    def EagerRun(self, function_desc, *arg):
+        with self._EagerGlobalFunctionDescScope(function_desc):
+            remote_blobs = compiler.EagerRun(
+                self, function_desc, self.config_proto, arg
+            )
+            if remote_blobs is None:
+                return
+            future_blob = EagerFutureRemoteBlobs().SetResult(remote_blobs).Inited()
+        annotation = inspect.signature(function_desc.job_func).return_annotation
+        return oft_util.TransformGlobalFunctionResult(future_blob, annotation)
+
+    def LaunchUserJob(self, job_func, *arg):
+        assert self.status_ is SessionStatus.RUNNING
+        job_name = job_func.__name__
+        push_util.AsyncPush(self, job_func, *arg)
+        self.LaunchJob(job_instance_util.MakeUserJobInstance(job_name))
+        return job_func.__oneflow_output_remote_blobs__
+
+    def LaunchJob(self, job_instance):
+        assert self.status_ is SessionStatus.RUNNING
+        self._IncRunningJobCnt()
+        job_instance.AddPostFinishCallback(lambda _: self._DecRunningJobCnt())
+        oneflow._oneflow_internal.LaunchJob(job_instance)
+
+    def AsyncPush(self, op_name, push_data_cb):
+        assert self.status_ is SessionStatus.RUNNING
+        push_job_name = self.inter_user_job_info.input_or_var_op_name2push_job_name[
+            op_name
+        ]
+        self.LaunchJob(
+            job_instance_util.MakePushJobInstance(push_job_name, op_name, push_data_cb)
+        )
+
+    def AsyncPull(self, op_name, pull_data_cb):
+        assert self.status_ is SessionStatus.RUNNING
+        pull_job_name = self.inter_user_job_info.output_or_var_op_name2pull_job_name[
+            op_name
+        ]
+        self.LaunchJob(
+            job_instance_util.MakePullJobInstance(pull_job_name, op_name, pull_data_cb)
+        )
+
+    def HasAnyCallbackAfterFunctionReturn(self):
+        return len(self.uuid2watch_handler) > 0
+
+    def StashVariableBlob4Job(self, job_name, var_name, var_blob):
+        if var_name not in self.var_name2var_blob_:
+            self.var_name2var_blob_[var_name] = var_blob
+        if job_name not in self.job_name2var_name2var_blob_:
+            self.job_name2var_name2var_blob_[job_name] = dict()
+        assert var_name not in self.job_name2var_name2var_blob_[job_name]
+        self.job_name2var_name2var_blob_[job_name][var_name] = var_blob
+
+    def AddInfo4InterfaceOpName(self, interface_op_name, op_attribute):
+        if oneflow.eager_execution_enabled():
+            self.interface_op_name2op_attr_[interface_op_name] = op_attribute
+            self.interface_op_name2job_name_[
+                interface_op_name
+            ] = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+        else:
+            pass
+
+    def OpAttribute4InterfaceOpName(self, interface_op_name):
+        return self.interface_op_name2op_attr_[interface_op_name]
+
+    def ParallelConf4LazyInterfaceOpName(self, interface_op_name):
+        return self.lazy_interface_op_name2parallel_conf_[interface_op_name]
+
+    def JobName4InterfaceOpName(self, interface_op_name):
+        return self.interface_op_name2job_name_[interface_op_name]
+
+    @property
+    def interface_ops(self):
+        return self.interface_op_name2op_attr_.keys()
+
+    def TryGetVariableBlobOfJobFromStash(self, job_name, var_name):
+        if var_name not in self.var_name2var_blob_:
+            return (None, None)
+        global_variable_blob = self.var_name2var_blob_[var_name]
+        if job_name not in self.job_name2var_name2var_blob_:
+            return (global_variable_blob, None)
+        var_name2var_blob = self.job_name2var_name2var_blob_[job_name]
+        if var_name not in var_name2var_blob:
+            return (global_variable_blob, None)
+        return (global_variable_blob, var_name2var_blob[var_name])
+
+    def CurrentEagerGlobalFunctionDesc(self):
+        if len(self.eager_global_function_desc_stack_) == 0:
+            return None
+        return self.eager_global_function_desc_stack_[0]
+
+    def has_empty_is_mirrored_strategy_enabled_stack(self):
+        return self.sess_.is_mirrored_strategy_enabled_stack_size() == 0
+
+    def push_mirrored_strategy_enabled(self, val):
+        assert isinstance(val, bool)
+        self.sess_.push_mirrored_strategy_enabled(val)
+
+    def pop_mirrored_strategy_enabled(self):
+        self.sess_.pop_mirrored_strategy_enabled()
+
+    def is_mirrored_strategy_enabled(self):
+        return self.sess_.is_mirrored_strategy_enabled()
+
+    def is_consistent_strategy_enabled(self):
+        return self.sess_.is_consistent_strategy_enabled()
+
+    @contextmanager
+    def _EagerGlobalFunctionDescScope(self, function_desc):
+        assert len(self.backward_blob_register.blob_name2object) == 0
+        assert len(self.job_name2var_name2var_blob_) == 0
+        self.eager_global_function_desc_stack_.insert(0, function_desc)
+        try:
+            yield
+        finally:
+            self.existed_module_names_ = set()
+            self.job_name2var_name2var_blob_ = dict()
+            self.eager_global_function_desc_stack_.pop(0)
+            keys = list(dict(self.backward_blob_register.blob_name2object).keys())
+            for key in keys:
+                self.backward_blob_register.ClearObject4BlobName(key)
+
+    def _IncRunningJobCnt(self):
+        assert self.status_ is SessionStatus.RUNNING
+        self.cond_var_.acquire()
+        self.running_job_cnt_ += 1
+        self.cond_var_.release()
+
+    def _DecRunningJobCnt(self):
+        self.cond_var_.acquire()
+        self.running_job_cnt_ -= 1
+        self.cond_var_.notify()
+        self.cond_var_.release()
+
+    def __del__(self):
+        self.TryClose()
+
+
+def api_find_or_create_module(
+    module_name: str, create: Callable[[], None], reuse: bool = False
+):
+    func = enable_if.unique([find_or_create_module])
+    return func(module_name, create, reuse)
+
+
+@enable_if.condition(hob.in_global_mode)
+def find_or_create_module(module_name, create, reuse=False):
+    assert callable(create)
+    sess = session_ctx.GetDefaultSession()
+    job_name = oneflow.current_global_function_desc().job_config_proto.job_name()
+    if job_name not in sess.job_name2module_name2module_:
+        sess.job_name2module_name2module_[job_name] = {}
+    module_name2module = sess.job_name2module_name2module_[job_name]
+    if module_name not in module_name2module:
+        module = create()
+        assert isinstance(module, module_util.Module)
+        module_name2module[module_name] = module
+    elif not reuse:
+        assert module_name not in sess.existed_module_names_, (
+            "duplicated module_name `%s' in global_function `%s'"
+            % (module_name, job_name)
+        )
+    else:
+        pass
+    sess.existed_module_names_.add(module_name)
+    return module_name2module[module_name]
+
+
+def api_eager_execution_enabled() -> bool:
+    """Get current setting of the job, if enable eager execution mode ,then return True
+
+    Returns:
+        bool: [description]
+    """
+    return oneflow._oneflow_internal.EagerExecutionEnabled()
+
+
+def api_clear_default_session() -> None:
+    """Clear the default session. All compiled OneFlow functions will be deleted.
+    """
+    func = enable_if.unique([clear_default_session])
+    return func()
+
+
+@enable_if.condition(hob.in_normal_mode)
+def clear_default_session():
+    is_multi_client = oneflow._oneflow_internal.IsMultiClient()
+    if not is_multi_client:
+        session_ctx.TryCloseDefaultSession()
+        session_ctx.OpenDefaultSession(
+            Session(oneflow._oneflow_internal.NewSessionId())
+        )
+
+
+def api_sync_default_session() -> None:
+    """Synchronize the default session. Block until every synchronous OneFlow function and its callback finishes running.
+    """
+    func = enable_if.unique([sync_default_session])
+    return func()
+
+
+@enable_if.condition(hob.in_normal_mode)
+def sync_default_session() -> None:
+    session_ctx.GetDefaultSession().Sync()
+
+
+def _TryCompleteConfigProto(config_proto):
+    if config_proto.resource.machine_num == 0:
+        config_proto.resource.machine_num = oneflow._oneflow_internal.GetNodeSize()
+
+
+def _GetDefaultConfigProto():
+    config_proto = job_set_util.ConfigProto()
+    config_proto.resource.machine_num = 0
+    if oneflow._oneflow_internal.flags.with_cuda():
+        config_proto.resource.gpu_device_num = 1
+    else:
+        config_proto.resource.cpu_device_num = 1
+        config_proto.resource.gpu_device_num = 0
+    config_proto.session_id = session_ctx.GetDefaultSession().id
+    return config_proto
+
+
+def TmpInitEagerGlobalSession():
+    config_pb = _GetDefaultConfigProto()
+    config_proto_str = text_format.MessageToString(config_pb)
+    oneflow._oneflow_internal.InitEagerGlobalSession(config_proto_str)
diff --git a/python/oneflow/framework/sysconfig.py b/python/oneflow/framework/sysconfig.py
new file mode 100644
index 0000000000000000000000000000000000000000..69211eb2b5157856656c8f83751abd6397f97fa4
--- /dev/null
+++ b/python/oneflow/framework/sysconfig.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import imp
+import importlib.util
+import os
+from typing import List
+
+import oneflow
+import oneflow._oneflow_internal
+
+
+def get_include() -> str:
+    return os.path.join(os.path.dirname(oneflow.__file__), "include")
+
+
+def get_lib() -> str:
+    return os.path.dirname(oneflow.__file__)
+
+
+def get_compile_flags() -> List[str]:
+    flags = []
+    flags.append("-I{}".format(get_include()))
+    flags.append("-DHALF_ENABLE_CPP11_USER_LITERALS=0")
+    if oneflow._oneflow_internal.flags.with_cuda():
+        flags.append("-DWITH_CUDA")
+    if oneflow._oneflow_internal.flags.use_cxx11_abi():
+        flags.append("-D_GLIBCXX_USE_CXX11_ABI=1")
+    else:
+        flags.append("-D_GLIBCXX_USE_CXX11_ABI=0")
+    return flags
+
+
+def get_link_flags() -> List[str]:
+    flags = []
+    flags.append("-L{}".format(get_lib()))
+    (file, oneflow_internal_lib_path, _) = imp.find_module(
+        "_oneflow_internal", [get_lib()]
+    )
+    if file:
+        file.close()
+    flags.append("-l:{}".format(os.path.basename(oneflow_internal_lib_path)))
+    return flags
+
+
+def with_cuda() -> bool:
+    return oneflow._oneflow_internal.flags.with_cuda()
+
+
+def with_xla() -> bool:
+    return oneflow._oneflow_internal.flags.with_xla()
+
+
+def has_rpc_backend_grpc() -> bool:
+    return oneflow._oneflow_internal.flags.has_rpc_backend_grpc()
+
+
+def has_rpc_backend_local() -> bool:
+    return oneflow._oneflow_internal.flags.has_rpc_backend_local()
+
+
+def cmake_build_type() -> str:
+    return oneflow._oneflow_internal.flags.cmake_build_type()
diff --git a/python/oneflow/framework/tensor.py b/python/oneflow/framework/tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..29500ebeef4975469adf476d0b1078140aca2fff
--- /dev/null
+++ b/python/oneflow/framework/tensor.py
@@ -0,0 +1,949 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+from typing import Union
+
+import numpy as np
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.lazy_mode as lazy_mode
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.framework.check_point_v2 as check_point_v2
+import oneflow.framework.dtype as dtype_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.ofblob as ofblob_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.runtime_mode as rt_mode
+import oneflow.framework.tensor_str as tensor_str_util
+import oneflow.ops.initializer_util as initializer_util
+import oneflow.support.async_util as async_util
+from oneflow._oneflow_internal.exception import IndexException
+from oneflow.framework.function_util import global_function_or_identity
+
+
+def register_local_tensor_method(name=None):
+    def decorator(method):
+        if name is None:
+            op_name = method.__name__
+        else:
+            op_name = name
+        setattr(oneflow._oneflow_internal.Tensor, op_name, method)
+        return method
+
+    return decorator
+
+
+@register_local_tensor_method("numpy")
+def _local_tensor_numpy(eager_local_tensor):
+    if eager_local_tensor.dtype == flow.tensor_buffer:
+        (shapes, dtypes) = eager_local_tensor._tensor_buffer_shapes_and_dtypes
+        tensors = flow.tensor_buffer_to_list_of_tensors(
+            Tensor(eager_local_tensor), shapes, dtypes
+        )
+        return [t.numpy() for t in tensors]
+    method_name = eager_local_tensor._get_copy_mirrored_tensor_to_numpy_func_name()
+    copy_to_numpy = getattr(eager_local_tensor, method_name)
+    ndarray = np.empty(
+        tuple(eager_local_tensor.shape),
+        dtype=flow.convert_oneflow_dtype_to_numpy_dtype(eager_local_tensor.dtype),
+    )
+    copy_to_numpy(ndarray)
+    return ndarray
+
+
+@register_local_tensor_method("copy_")
+def _copy_from_numpy_to_eager_local_tensor(eager_local_tensor, np_arr):
+    method_name = eager_local_tensor._get_copy_mirrored_tensor_from_numpy_func_name()
+    copy_from_numpy = getattr(eager_local_tensor, method_name)
+    assert np_arr.dtype == flow.convert_oneflow_dtype_to_numpy_dtype(
+        eager_local_tensor.dtype
+    )
+    if np_arr.shape == ():
+        assert tuple(eager_local_tensor.shape) == (1,)
+    else:
+        assert np_arr.shape == tuple(eager_local_tensor.shape)
+    copy_from_numpy(np_arr)
+
+
+@register_local_tensor_method("_init_by_initializer_conf")
+def _init_eager_local_tensor_by_initializer_conf(
+    eager_local_tensor, initializer_conf, random_seed=0
+):
+    shape = tuple(eager_local_tensor.shape)
+    initializer = initializer_util.GetInitializer(initializer_conf, random_seed, shape)
+    if initializer is None:
+        return
+    _copy_from_numpy_to_eager_local_tensor(
+        eager_local_tensor,
+        check_point_v2.generate_values_by_initializer(
+            initializer, shape, eager_local_tensor.dtype
+        ),
+    )
+
+
+def construct_tensor(
+    data,
+    dtype=None,
+    device=None,
+    requires_grad=False,
+    placement=None,
+    sbp=None,
+    is_consistent=False,
+    is_lazy=False,
+):
+    if _is_scalar(data) or _input_args_is_data(data):
+        if (
+            not _input_args_is_numpy(data)
+            and dtype is None
+            and _input_dtype_is_float(data)
+        ):
+            dtype = flow.float32
+        data = np.array(data)
+        if dtype is None:
+            dtype = dtype_util.convert_numpy_dtype_to_oneflow_dtype(data.dtype)
+        return Tensor(
+            data,
+            dtype=dtype,
+            device=device,
+            requires_grad=requires_grad,
+            placement=placement,
+            sbp=sbp,
+            is_consistent=is_consistent,
+            is_lazy=is_lazy,
+        )
+    else:
+        raise TypeError("Construction error, invalid combination of arguments")
+
+
+class Tensor:
+    def __init__(
+        self,
+        *args,
+        dtype=None,
+        device=None,
+        requires_grad=False,
+        placement=None,
+        sbp=None,
+        is_consistent=False,
+        is_lazy=False,
+        data_initializer=None,
+        determining_initializer=None,
+    ):
+        assert len(args) > 0
+        dtype = dtype if dtype is not None else oneflow._oneflow_internal.float32
+        if isinstance(device, str):
+            device = flow.device(device)
+        if placement is None:
+            device = (
+                device
+                if device is not None
+                else oneflow._oneflow_internal.device("cpu")
+            )
+        if _input_args_is_tensor(*args):
+            self._local_or_consistent_tensor = flow.to(
+                *args, device=args[0].device, dtype=args[0].dtype, copy=True
+            )
+            self._undetermined_tensor = None
+        elif _input_args_is_consistent_or_local(*args):
+            self._local_or_consistent_tensor = args[0]
+            self._undetermined_tensor = None
+        elif _input_args_is_data(*args):
+            self._local_or_consistent_tensor = None
+            self._construct_with_data(
+                *args,
+                dtype=dtype,
+                device=device,
+                requires_grad=requires_grad,
+                placement=placement,
+                sbp=sbp,
+                is_consistent=is_consistent,
+                is_lazy=is_lazy,
+            )
+        elif _input_args_is_shape(*args):
+            shape = args
+            self._local_or_consistent_tensor = None
+            self._undetermined_tensor = UndeterminedTensor(
+                shape,
+                dtype,
+                device=device,
+                requires_grad=requires_grad,
+                placement=placement,
+                sbp=sbp,
+                is_consistent=is_consistent,
+                is_lazy=is_lazy,
+                data_initializer=data_initializer,
+            )
+            if determining_initializer is None:
+                determining_initializer = _default_initializer_for_determining
+            self._determining_initializer = determining_initializer
+        else:
+            raise TypeError("new() received an invalid combination of arguments")
+
+    @property
+    def shape(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.shape
+        else:
+            return self._undetermined_tensor.shape
+
+    def stride(self):
+        assert self.is_determined
+        return self._local_or_consistent_tensor.stride()
+
+    def storage_offset(self):
+        assert self.is_determined
+        return self._local_or_consistent_tensor.storage_offset()
+
+    def is_contiguous(self):
+        assert self.is_determined
+        return self._local_or_consistent_tensor.is_contiguous()
+
+    @property
+    def device(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.device
+        else:
+            return self._undetermined_tensor.device
+
+    @register_local_tensor_method("ndim")
+    @property
+    def ndim(self):
+        return len(self.shape)
+
+    @property
+    def is_cuda(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_cuda
+        else:
+            return self._undetermined_tensor.is_cuda
+
+    @property
+    def dtype(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.dtype
+        else:
+            return self._undetermined_tensor.dtype
+
+    def _auto_determine(func):
+        def wrapped_func(*args, **kwargs):
+            tensor = args[0]
+            if not tensor.is_determined:
+                tensor.determine()
+            return func(*args, **kwargs)
+
+        return wrapped_func
+
+    @property
+    @_auto_determine
+    def data(self):
+        if self._local_or_consistent_tensor is not None:
+            return flow.Tensor(self._local_or_consistent_tensor.data)
+        else:
+            return None
+
+    @property
+    def grad(self):
+        if self._local_or_consistent_tensor is not None:
+            if self._local_or_consistent_tensor.grad is not None:
+                return flow.Tensor(self._local_or_consistent_tensor.grad)
+        else:
+            return None
+
+    @grad.setter
+    @_auto_determine
+    def grad(self, new_grad):
+        def check_grad(grad, new_grad):
+            assert (
+                grad.shape == new_grad.shape
+            ), f"Shape of grads are not equal, {grad.shape} vs {new_grad.shape}"
+            assert (
+                grad.device == new_grad.device
+            ), f"Device of grads are not equal, {grad.device} vs {new_grad.device}"
+            assert (
+                grad.dtype == new_grad.dtype
+            ), f"Data type of grads are not equal, {grad.dtype} vs {new_grad.dtype}"
+
+        if self._local_or_consistent_tensor is not None:
+            if new_grad is None:
+                self._local_or_consistent_tensor.set_grad(None)
+            else:
+                if isinstance(new_grad, Tensor):
+                    if not new_grad.is_determined:
+                        new_grad.determine()
+                    new_grad = new_grad._local_or_consistent_tensor
+                new_grad_detach = new_grad.detach()
+                check_grad(self.grad, new_grad_detach)
+                self._local_or_consistent_tensor.set_grad(new_grad_detach)
+
+    @property
+    def grad_fn(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.grad_fn
+        else:
+            return None
+
+    @property
+    def requires_grad(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.requires_grad
+        else:
+            return self._undetermined_tensor.requires_grad
+
+    @property
+    def is_leaf(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_leaf
+        else:
+            return True
+
+    @requires_grad.setter
+    def requires_grad(self, requires_grad):
+        if self._local_or_consistent_tensor is not None:
+            self._local_or_consistent_tensor.requires_grad = requires_grad
+        else:
+            self._undetermined_tensor.requires_grad = requires_grad
+
+    @register_local_tensor_method()
+    def size(self, idx=None):
+        if idx is None:
+            return self.shape
+        else:
+            return self.shape[idx]
+
+    @register_local_tensor_method()
+    def dim(self):
+        return self.ndim
+
+    @register_local_tensor_method()
+    def ndimension(self):
+        return self.ndim
+
+    @_auto_determine
+    def detach(self):
+        if self._local_or_consistent_tensor is not None:
+            return flow.Tensor(self._local_or_consistent_tensor.detach())
+        else:
+            return None
+
+    @_auto_determine
+    def clone(self):
+        if self._local_or_consistent_tensor is not None:
+            return flow.Tensor(self._local_or_consistent_tensor.clone())
+        else:
+            return None
+
+    def requires_grad_(self, requires_grad=True):
+        self.requires_grad = requires_grad
+
+    def get_device(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.device
+        else:
+            return self._undetermined_tensor.device
+
+    @register_local_tensor_method()
+    def nelement(self):
+        prod = 1
+        for dim in self.shape:
+            prod *= dim
+        return prod
+
+    @register_local_tensor_method()
+    def numel(self):
+        return self.nelement()
+
+    def retain_grad(self):
+        assert self.is_determined
+        self._local_or_consistent_tensor.retain_grad()
+
+    def data_ptr(self):
+        TODO()
+
+    def element_size(self):
+        return self.dtype.bytes
+
+    @_auto_determine
+    def numpy(self):
+        internal_tensor = self._local_or_consistent_tensor
+        if not internal_tensor.is_lazy and (not internal_tensor.is_consistent):
+            return _local_tensor_numpy(internal_tensor)
+        raise NotImplementedError()
+
+    @register_local_tensor_method()
+    def tolist(self):
+        return self.numpy().tolist()
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def backward(self, gradient=None, retain_graph=False, create_graph=False):
+        if not lazy_mode.is_enabled():
+            flow.autograd.backward(self, gradient, retain_graph, create_graph)
+        else:
+            assert (
+                self.is_lazy
+            ), "nn.Graph only accept lazy tensor to call backward() in lazy mode."
+            flow._oneflow_internal.nn.graph.AddTensorAsGraphLoss(self)
+
+    @register_local_tensor_method()
+    def _transform_ellipsis_type(self, key):
+        d = self.ndim - len(key)
+        new_key = list()
+        for k in key:
+            if isinstance(k, type(Ellipsis)):
+                new_key.append(slice(None, None, None))
+                while d > 0:
+                    new_key.append(slice(None, None, None))
+                    d -= 1
+            else:
+                new_key.append(k)
+        return tuple(new_key)
+
+    @register_local_tensor_method()
+    def _get_slice_obj(self, key):
+        def get_or_default(x, default):
+            return x if x is not None else default
+
+        def get_canonical_index(index, length, *, start=0):
+            if index < 0:
+                index += length
+            if index > length or index < 0:
+                raise IndexError(f"Index should be in [0, {length}), but got {index}")
+            return max(min(index, length), start)
+
+        def get_slice_if_int(x):
+            if isinstance(x, slice):
+                return x
+            return slice(x, x + 1)
+
+        if isinstance(key, tuple):
+            assert all((isinstance(x, (slice, int)) for x in key))
+        else:
+            assert isinstance(key, (slice, int))
+            key = (key,)
+        key = list(map(get_slice_if_int, key))
+        assert len(key) <= len(self.shape)
+        for i in range(len(key), len(self.shape)):
+            key += (slice(None, None, None),)
+        starts = [
+            get_canonical_index(get_or_default(x.start, 0), self.shape[i])
+            for (i, x) in enumerate(key)
+        ]
+        stops = [
+            get_canonical_index(
+                get_or_default(x.stop, self.shape[i]), self.shape[i], start=starts[i]
+            )
+            for (i, x) in enumerate(key)
+        ]
+        steps = [get_or_default(x.step, 1) for x in key]
+        assert all((x > 0 for x in steps))
+        shape = (np.abs(np.array(stops) - np.array(starts)) - 1) // np.abs(
+            np.array(steps)
+        ) + 1
+        shape = shape.tolist()
+        return (starts, stops, steps, shape)
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def __getitem__(self, key):
+        try:
+            return flow.F.tensor_getitem(self, key)
+        except IndexException as e:
+            raise IndexError(e)
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def __setitem__(self, key, value):
+        if isinstance(value, (int, float)):
+            value = flow.F.constant([1], value, self.dtype)
+        flow.F.tensor_setitem(self, key, value)
+        return self
+
+    @register_local_tensor_method()
+    def __str__(self):
+        return self.__repr__()
+
+    @register_local_tensor_method()
+    def __repr__(self):
+        return tensor_str_util._gen_tensor_str(self)
+
+    @register_local_tensor_method()
+    def __gt__(self, other):
+        return self.gt(other)
+
+    @register_local_tensor_method()
+    def __lt__(self, other):
+        return self.lt(other)
+
+    @register_local_tensor_method()
+    def __ge__(self, other):
+        return self.ge(other)
+
+    @register_local_tensor_method()
+    def __le__(self, other):
+        return self.le(other)
+
+    def __array__(self):
+        TODO()
+
+    def __sizeof__(self):
+        TODO()
+
+    def __deepcopy__(self, memo):
+        TODO()
+
+    @register_local_tensor_method()
+    def __mul__(self, other):
+        return self.mul(other)
+
+    @register_local_tensor_method()
+    def __rmul__(self, other):
+        return self.mul(other)
+
+    @register_local_tensor_method()
+    def __add__(self, other):
+        return self.add(other)
+
+    @register_local_tensor_method()
+    def __iadd__(self, other):
+        return self.add_(other)
+
+    @register_local_tensor_method()
+    def __radd__(self, other):
+        return self.add(other)
+
+    @register_local_tensor_method()
+    def __sub__(self, other):
+        return self.sub(other)
+
+    @register_local_tensor_method()
+    def __rsub__(self, other):
+        return flow.sub(other, self)
+
+    @register_local_tensor_method()
+    def __truediv__(self, other):
+        return self.div(other)
+
+    @register_local_tensor_method()
+    def __rtruediv__(self, other):
+        return flow.div(other, self)
+
+    @register_local_tensor_method()
+    def __neg__(self):
+        return flow.neg(self)
+
+    @register_local_tensor_method()
+    def __pow__(self, b):
+        return flow.pow(self, b)
+
+    @register_local_tensor_method()
+    def __mod__(self, other):
+        return flow.experimental.fmod(self, other)
+
+    def _determine_if_needed(self, determining_initializer=None):
+        if not self.is_determined:
+            self.determine(determining_initializer)
+
+    def determine(self, determining_initializer=None):
+        assert not self.is_determined
+        if determining_initializer is None:
+            determining_initializer = self._determining_initializer
+        self._local_or_consistent_tensor = determining_initializer(self)
+        self._undetermined_tensor = None
+
+    @property
+    def is_determined(self):
+        if self._local_or_consistent_tensor is not None:
+            assert self._undetermined_tensor is None
+            return True
+        else:
+            assert self._undetermined_tensor is not None
+            return False
+
+    def set_placement(self, placement):
+        assert isinstance(placement, flow.placement)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.placement = placement
+        self._undetermined_tensor.device = None
+
+    def set_sbp(self, sbp):
+        assert isinstance(sbp, oneflow._oneflow_internal.Distribute)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.sbp = sbp
+
+    def set_is_consistent(self, is_consistent):
+        assert isinstance(is_consistent, bool)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.is_consistent = is_consistent
+
+    def set_is_lazy(self, is_lazy):
+        assert isinstance(is_lazy, bool)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.is_lazy = is_lazy
+
+    def set_data_initializer(self, data_initializer):
+        assert isinstance(data_initializer, initializer_conf_util.InitializerConf)
+        assert self._local_or_consistent_tensor is None
+        assert self._undetermined_tensor is not None
+        self._undetermined_tensor.data_initializer = data_initializer
+
+    @property
+    def placement(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.placement
+        else:
+            return self._undetermined_tensor.placement
+
+    @property
+    def is_lazy(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_lazy
+        else:
+            return self._undetermined_tensor.is_lazy
+
+    @property
+    def is_consistent(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.is_consistent
+        else:
+            return self._undetermined_tensor.is_consistent
+
+    @property
+    def sbp(self):
+        if self._local_or_consistent_tensor is not None:
+            return self._local_or_consistent_tensor.sbp
+        else:
+            return self._undetermined_tensor.sbp
+
+    @register_local_tensor_method()
+    def uniform_(self, a=0, b=1):
+        initializer_conf = flow.random_uniform_initializer(
+            minval=a, maxval=b, dtype=self.dtype
+        )
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def kaiming_uniform_(
+        self, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+    ):
+        initializer_conf = flow.kaiming_initializer(
+            shape=self.shape,
+            distribution="random_uniform",
+            mode=mode,
+            nonlinearity=nonlinearity,
+            negative_slope=a,
+            data_format=data_format,
+        )
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def kaiming_normal_(
+        self, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+    ):
+        initializer_conf = flow.kaiming_initializer(
+            shape=self.shape,
+            distribution="random_normal",
+            mode=mode,
+            nonlinearity=nonlinearity,
+            negative_slope=a,
+            data_format=data_format,
+        )
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def xavier_normal_(self, gain=1.0, *, data_format="NCHW"):
+        assert gain == 1.0, "Only gain == 1.0 is supported now"
+        initializer_conf = flow.xavier_normal_initializer(data_format=data_format)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def xavier_uniform_(self, gain=1.0, *, data_format="NCHW"):
+        assert gain == 1.0, "Only gain == 1.0 is supported now"
+        initializer_conf = flow.xavier_uniform_initializer(data_format=data_format)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def normal_(self, mean=0, std=1):
+        initializer_conf = flow.random_normal_initializer(mean=mean, stddev=std)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @register_local_tensor_method()
+    def fill_(self, value):
+        initializer_conf = flow.constant_initializer(value=value, dtype=self.dtype)
+        return self._init_by_initializer_conf(initializer_conf)
+
+    @_auto_determine
+    def zeros_(self):
+        internal_tensor = self._local_or_consistent_tensor
+        if internal_tensor.is_lazy:
+            TODO()
+        if internal_tensor.is_consistent:
+            TODO()
+        internal_tensor.zeros_()
+
+    @_auto_determine
+    @register_local_tensor_method()
+    def register_hook(self, hook):
+        assert self.is_leaf, "register_hook only supports leaf tensor for now"
+        assert (
+            self.requires_grad
+        ), "register_hook only supports tensor with requires_grad=True"
+
+        def hook_returning_determined_tensor(grad):
+            new_grad = hook(grad)
+            if isinstance(new_grad, Tensor) and (not new_grad.is_determined):
+                new_grad.determine()
+                new_grad = new_grad._local_or_consistent_tensor
+            return new_grad
+
+        self._local_or_consistent_tensor._register_hook(
+            hook_returning_determined_tensor
+        )
+
+    @_auto_determine
+    def copy_(self, other: Union["Tensor", np.ndarray]):
+        internal_tensor = self._local_or_consistent_tensor
+        if internal_tensor.is_lazy:
+            TODO()
+        if internal_tensor.is_consistent:
+            TODO()
+        if isinstance(other, (Tensor, check_point_v2.FileBackendVariableBlob)):
+            src_np = other.numpy()
+        else:
+            assert isinstance(other, np.ndarray)
+            src_np = other
+        _copy_from_numpy_to_eager_local_tensor(internal_tensor, src_np)
+
+    def _init_by_initializer_conf(self, initializer_conf):
+        if self.is_determined:
+            if self.is_consistent:
+                with self._placement_scope():
+                    check_point_v2.init_by_initializer_conf(
+                        self, initializer_conf, True, None
+                    )
+            else:
+                _init_eager_local_tensor_by_initializer_conf(
+                    self._local_or_consistent_tensor, initializer_conf
+                )
+        else:
+            self.set_data_initializer(initializer_conf)
+        return self
+
+    def _placement_scope(self):
+        if self.is_consistent:
+            return _convert_to_placement_scope(self.placement)
+        else:
+            return _convert_to_placement_scope(self.device)
+
+    def _construct_with_data(
+        self,
+        *args,
+        dtype=None,
+        device=None,
+        requires_grad=False,
+        placement=None,
+        sbp=None,
+        is_consistent=False,
+        is_lazy=False,
+    ):
+        numpy_data = None
+        if _input_args_is_tuple_or_list(*args):
+            numpy_data = np.array(args[0])
+        elif _input_args_is_numpy(*args):
+            numpy_data = np.ascontiguousarray(args[0])
+        numpy_data = numpy_data.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
+        shape = oneflow._oneflow_internal.Size(tuple(numpy_data.shape))
+        self._determining_initializer = _numpy_initializer_for_determining
+        self._undetermined_tensor = UndeterminedTensor(
+            shape,
+            dtype,
+            device=device,
+            requires_grad=requires_grad,
+            placement=placement,
+            sbp=sbp,
+            is_consistent=is_consistent,
+            is_lazy=is_lazy,
+            numpy_data=numpy_data,
+        )
+
+
+class UndeterminedTensor:
+    def __init__(
+        self,
+        shape,
+        dtype,
+        device=None,
+        requires_grad=False,
+        placement=None,
+        sbp=None,
+        is_consistent=False,
+        is_lazy=False,
+        data_initializer=None,
+        numpy_data=None,
+    ):
+        if not isinstance(shape, oneflow._oneflow_internal.Size):
+            if not isinstance(shape, tuple):
+                shape = tuple(shape)
+            shape = oneflow._oneflow_internal.Size(shape)
+        data_initializer = (
+            data_initializer
+            if data_initializer is not None
+            else flow.empty_initializer(dtype=dtype)
+        )
+        device = (
+            device if device is not None else oneflow._oneflow_internal.device("cpu")
+        )
+        self.shape = shape
+        self.dtype = dtype
+        self.device = device
+        self.requires_grad = requires_grad
+        self.placement = placement
+        self.sbp = sbp
+        self.is_consistent = is_consistent
+        self.is_lazy = is_lazy
+        self.data_initializer = data_initializer
+        self.numpy_data = numpy_data
+
+    @property
+    def is_cuda(self):
+        device_type = None
+        if self.placement is not None:
+            device_type = self.placement.device_tag
+        elif self.device is not None:
+            device_type = self.device.type
+        else:
+            raise ValueError("Neither placement nor device found.")
+        return device_type == "gpu" or device_type == "cuda"
+
+
+def _default_initializer_for_determining(tensor):
+    assert not tensor.is_determined
+    undetermined_tensor = tensor._undetermined_tensor
+    if undetermined_tensor.is_consistent:
+        raise NotImplementedError()
+    else:
+        shape = undetermined_tensor.shape
+        dtype = undetermined_tensor.dtype
+        determined_tensor = oneflow._oneflow_internal.Tensor(
+            shape,
+            dtype,
+            undetermined_tensor.device,
+            undetermined_tensor.is_lazy,
+            undetermined_tensor.requires_grad,
+            True,
+        )
+        _init_eager_local_tensor_by_initializer_conf(
+            determined_tensor, undetermined_tensor.data_initializer
+        )
+    return determined_tensor
+
+
+def _numpy_initializer_for_determining(tensor):
+    assert not tensor.is_determined
+    undetermined_tensor = tensor._undetermined_tensor
+    numpy_data = undetermined_tensor.numpy_data
+    assert numpy_data is not None
+    if undetermined_tensor.is_consistent:
+        raise NotImplementedError()
+    else:
+        determined_tensor = oneflow._oneflow_internal.Tensor(
+            undetermined_tensor.shape,
+            undetermined_tensor.dtype,
+            undetermined_tensor.device,
+            undetermined_tensor.is_lazy,
+            undetermined_tensor.requires_grad,
+            True,
+        )
+        _copy_from_numpy_to_eager_local_tensor(determined_tensor, numpy_data)
+    return determined_tensor
+
+
+def _input_args_is_tuple_or_list(*args):
+    return len(args) == 1 and isinstance(args[0], (tuple, list))
+
+
+def _input_args_is_numpy(*args):
+    return len(args) == 1 and isinstance(args[0], np.ndarray)
+
+
+def _input_args_is_consistent_or_local(*args):
+    return len(args) == 1 and isinstance(args[0], oneflow._oneflow_internal.Tensor)
+
+
+def _input_args_is_tensor(*args):
+    return len(args) == 1 and isinstance(args[0], flow.Tensor)
+
+
+def _input_args_is_data(*args):
+    return _input_args_is_numpy(*args) or _input_args_is_tuple_or_list(*args)
+
+
+def _input_args_is_shape(*args):
+    return all((isinstance(x, int) for x in args))
+
+
+def register_tensor_op(op_name):
+    def set_tensor_op(method):
+        setattr(Tensor, op_name, method)
+        setattr(oneflow._oneflow_internal.Tensor, op_name, method)
+        return method
+
+    return set_tensor_op
+
+
+def _convert_to_placement_scope(placement_or_device):
+    if isinstance(placement_or_device, flow.placement):
+        placement = placement_or_device
+        return flow.scope.placement(
+            placement.device_tag,
+            list(placement.parallel_conf.device_name()),
+            placement.hierarchy,
+        )
+    else:
+        device = placement_or_device
+        machine_id = 0
+        if device.type == "cuda":
+            device_tag = "gpu"
+        else:
+            device_tag = device.type
+        return flow.scope.placement(
+            device_tag, "{}:{}".format(machine_id, device.index), None
+        )
+
+
+def _is_scalar(data):
+    return isinstance(data, (int, float, bool, complex))
+
+
+def _flatten_list_or_tuple(list_or_tuple):
+    for item in list_or_tuple:
+        if isinstance(item, (list, tuple)):
+            yield from _flatten_list_or_tuple(item)
+        else:
+            yield item
+
+
+def _input_dtype_is_float(data):
+    if _is_scalar(data):
+        return isinstance(data, float)
+    elif isinstance(data, (list, tuple)):
+        return any((isinstance(x, float) for x in _flatten_list_or_tuple(data)))
+    return False
diff --git a/python/oneflow/framework/tensor_str.py b/python/oneflow/framework/tensor_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb700024776295a56821419690dbf2360e571d0
--- /dev/null
+++ b/python/oneflow/framework/tensor_str.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+import oneflow as flow
+
+
+def _add_suffixes(tensor_str, suffixes, indent):
+    tensor_strs = [tensor_str]
+    last_line_len = len(tensor_str) - tensor_str.rfind("\n") + 1
+    linewidth = 80
+    for suffix in suffixes:
+        suffix_len = len(suffix)
+        if last_line_len + suffix_len + 2 > linewidth:
+            tensor_strs.append(",\n" + " " * indent + suffix)
+            last_line_len = indent + suffix_len
+        else:
+            tensor_strs.append(", " + suffix)
+            last_line_len += suffix_len + 2
+    tensor_strs.append(")")
+    return "".join(tensor_strs)
+
+
+def _gen_tensor_str(tensor):
+    prefix = "tensor("
+    indent = len(prefix)
+    suffixes = []
+    if tensor.device.type != "cpu" or (
+        tensor.device.type == "cuda" and tensor.device.index != 0
+    ):
+        suffixes.append("device='" + str(tensor.device) + "'")
+    suffixes.append("dtype=" + str(tensor.dtype))
+    if tensor.grad_fn is not None:
+        name = tensor.grad_fn.name()
+        suffixes.append("grad_fn=<{}>".format(name))
+    elif tensor.requires_grad:
+        suffixes.append("requires_grad=True")
+    tensor_str = np.array2string(
+        tensor.numpy(), precision=4, separator=", ", prefix=prefix
+    )
+    return _add_suffixes(prefix + tensor_str, suffixes, indent)
diff --git a/python/oneflow/framework/tensor_tuple_util.py b/python/oneflow/framework/tensor_tuple_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eba0fa527c38f86c7c37efaf636c117ce4ffdab
--- /dev/null
+++ b/python/oneflow/framework/tensor_tuple_util.py
@@ -0,0 +1,45 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Tuple, Union
+
+from oneflow._oneflow_internal import Tensor, TensorTuple
+from oneflow.framework.tensor import Tensor as PyTensor
+
+
+def convert_to_tensor_tuple(
+    args: Optional[Union[PyTensor, Sequence[PyTensor], Tensor, Sequence[Tensor]]]
+):
+    if args is None:
+        return TensorTuple()
+    elif isinstance(args, collections.abc.Sequence):
+        if len(args) == 0:
+            return TensorTuple()
+        if isinstance(args[0], PyTensor):
+            for tensor in args:
+                if not tensor.is_determined:
+                    tensor.determine()
+            return TensorTuple([x._local_or_consistent_tensor for x in args])
+        return TensorTuple(args)
+    else:
+        tensor_tuple = TensorTuple()
+        if isinstance(args, PyTensor):
+            if not args.is_determined:
+                args.determine()
+            tensor_tuple.append(args._local_or_consistent_tensor)
+        else:
+            tensor_tuple.append(args)
+        return tensor_tuple
diff --git a/python/oneflow/framework/typing.py b/python/oneflow/framework/typing.py
new file mode 100644
index 0000000000000000000000000000000000000000..553e0105d2c474cb178e43741ac829d16bd91818
--- /dev/null
+++ b/python/oneflow/framework/typing.py
@@ -0,0 +1,151 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import sys
+import typing
+from typing import Optional, Sequence
+
+import oneflow
+import oneflow.framework.input_blob_def as input_blob_def
+
+
+class PyStructCompatibleToBlob(object):
+    pass
+
+
+class Numpy(PyStructCompatibleToBlob):
+    """`Numpy` is a type hint for numpy output of a OneFlow global function
+    For instance::
+
+        @oneflow.global_function()
+        def foo() -> oneflow.typing.Numpy:
+            loss = ... # your network
+            return loss
+
+        loss = foo() # get a numpy.ndarray
+        print(loss)
+    """
+
+    def Placeholder(shape: Sequence[int], dtype=oneflow.float):
+        """`Numpy.Placeholder` is a typing function for numpy input of a OneFlow global function.
+        A `numpy.ndarray` takes a `Numpy.Placeholder`'s place must have an identical shape.
+        For instance::
+
+            @oneflow.global_function()
+            def foo(
+                image_blob: oneflow.typing.Numpy.Placeholder(
+                    (2, 255, 255, 3), dtype=flow.float32
+                )
+            ):
+                # your network
+
+            foo(np.random.randn(2, 255, 255, 3).astype(np.float32))
+
+        """
+        assert type(shape) is tuple, "shape should be a tuple. %s found" % shape
+        return type("Numpy.Placeholder", (NumpyDef,), dict(shape=shape, dtype=dtype))
+
+
+class ListNumpy(PyStructCompatibleToBlob):
+    """`ListNumpy` is a type hint for numpy output of a OneFlow global function
+    For instance::
+
+        @oneflow.global_function()
+        def foo() -> oneflow.typing.ListNumpy:
+            mirrored_tensors = ... # your network
+            return mirrored_tensors
+
+        mirrored_tensors = foo() # get a list of numpy.ndarray
+        for tensor in mirrored_tensors:
+            print(mirrored_tensors)
+    """
+
+    def Placeholder(shape: Sequence[int], dtype=oneflow.float):
+        """`ListNumpy.Placeholder` is a typing function for numpy input of a OneFlow global function.
+        A `list` of `numpy.ndarray` takes a `ListNumpy.Placeholder`'s place. Each `numpy.ndarray` in the `list` could have any shape as long as it has the same rank and a smaller/equal size.
+        For instance::
+
+            @oneflow.global_function()
+            def foo(
+                image_blob: oneflow.typing.ListNumpy.Placeholder(
+                    (2, 255, 255, 3), dtype=flow.float32
+                )
+            ):
+                # your network
+
+            input1 = np.random.randn(2, 255, 255, 3).astype(np.float32)
+            input2 = np.random.randn(2, 251, 251, 3).astype(np.float32)
+            foo([input1])
+            foo([input2])
+
+        """
+        assert type(shape) is tuple, "shape should be a tuple. %s found" % shape
+        return type(
+            "ListNumpy.Placeholder", (ListOfNumpyDef,), dict(shape=shape, dtype=dtype)
+        )
+
+
+class OneflowNumpyDef(object):
+    @classmethod
+    def NewInputBlobDef(subclass):
+        raise NotImplementedError
+
+
+class NumpyDef(OneflowNumpyDef):
+    @classmethod
+    def NewInputBlobDef(subclass):
+        return input_blob_def.FixedTensorDef(subclass.shape, dtype=subclass.dtype)
+
+
+class ListOfNumpyDef(OneflowNumpyDef):
+    @classmethod
+    def NewInputBlobDef(subclass):
+        return input_blob_def.MirroredTensorDef(subclass.shape, dtype=subclass.dtype)
+
+
+class Callback(typing.Generic[typing.TypeVar("T")]):
+    pass
+
+
+class Bundle(typing.Generic[typing.TypeVar("T")]):
+    """
+    One or a collection of  typing.Numpy/typing.ListNumpy,
+    such as x, [x], (x,), {"key": x} and the mixed form of them.
+    """
+
+    pass
+
+
+def OriginFrom(parameterised, generic):
+    if inspect.isclass(parameterised) and inspect.isclass(generic):
+        return issubclass(parameterised, generic)
+    if generic == OneflowNumpyDef:
+        assert not inspect.isclass(parameterised)
+        return False
+    if (sys.version_info.major, sys.version_info.minor) >= (3, 7):
+        if not hasattr(parameterised, "__origin__"):
+            return False
+        if generic == typing.Dict:
+            return parameterised.__origin__ is dict
+        if generic == typing.Tuple:
+            return parameterised.__origin__ is tuple
+        if generic == typing.List:
+            return parameterised.__origin__ is list
+        if generic == Callback:
+            return parameterised.__origin__ is Callback
+        if generic == Bundle:
+            return parameterised.__origin__ is Bundle
+    raise NotImplementedError("python typing is a monster torturing everyone.")
diff --git a/python/oneflow/framework/typing_util.py b/python/oneflow/framework/typing_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbabdb159e888233b9fa25d60a25d07505ca8cff
--- /dev/null
+++ b/python/oneflow/framework/typing_util.py
@@ -0,0 +1,298 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import typing
+
+import oneflow._oneflow_internal
+import oneflow.experimental.typing_check as enable_typing_check
+import oneflow.framework.local_blob as local_blob_util
+import oneflow.framework.pull_util as pull_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.typing as oft
+
+
+def CheckGlobalFunctionAnnotation(signature):
+    parameters = signature.parameters
+    if all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
+        for (_, p) in parameters.items():
+            assert (
+                p.kind == inspect._ParameterKind.POSITIONAL_OR_KEYWORD
+            ), "no parameters like *args or **kwargs supported"
+            CheckGlobalFunctionParamAnnotation(p.annotation)
+    elif enable_typing_check.typing_check_enabled:
+        for (param_name, p) in parameters.items():
+            if p.annotaion is inspect._empty:
+                raise NotImplementedError("parameter %s is not annotated" % param_name)
+    else:
+        pass
+    return_annotation = signature.return_annotation
+    if return_annotation is not inspect._empty:
+        CheckGlobalFunctionReturnAnnotation(return_annotation)
+    elif enable_typing_check.typing_check_enabled:
+        raise NotImplementedError("no return annotation found.")
+    else:
+        pass
+
+
+def CheckGlobalFunctionParamAnnotation(cls):
+    if oft.OriginFrom(cls, typing.Tuple):
+        assert cls.__args__ is not None, "T in typing.Tuple[T, ...] cannot be omitted"
+        assert len(cls.__args__) > 0
+        for cls_arg in cls.__args__:
+            CheckGlobalFunctionParamAnnotation(cls_arg)
+    elif oft.OriginFrom(cls, oft.OneflowNumpyDef):
+        pass
+    else:
+        raise NotImplementedError("invalid parameter annotation %s found" % cls)
+
+
+def CheckGlobalFunctionReturnAnnotation(cls):
+    if cls is None:
+        pass
+    elif oft.OriginFrom(cls, oft.Callback):
+        assert (
+            cls.__args__ is not None
+        ), "T in oneflow.typing.Callback[T] cannot be omitted"
+        assert len(cls.__args__) == 1
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[0])
+    elif oft.OriginFrom(cls, oft.Bundle):
+        assert cls.__args__[0] in (
+            oft.Numpy,
+            oft.ListNumpy,
+        ), "T in oneflow.typing.Bundle[T] must be one of (oneflow.typing.Numpy, oneflow.typing.ListNumpy)"
+        assert len(cls.__args__) == 1
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[0])
+    else:
+        _CheckGlobalFunctionReturnAnnotation(cls)
+
+
+def _CheckGlobalFunctionReturnAnnotation(cls):
+    if oft.OriginFrom(cls, typing.Tuple):
+        assert cls.__args__ is not None, "T in typing.Tuple[T, ...] cannot be omitted"
+        assert len(cls.__args__) > 0
+        for cls_arg in cls.__args__:
+            _CheckGlobalFunctionReturnAnnotation(cls_arg)
+    elif oft.OriginFrom(cls, typing.List):
+        assert cls.__args__ is not None, "T in typing.List[T] cannot be omitted"
+        assert len(cls.__args__) == 1
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[0])
+    elif oft.OriginFrom(cls, typing.Dict):
+        assert cls.__args__ is not None, "(K, V) in typing.Dict[K,V] cannot be omitted"
+        assert len(cls.__args__) == 2
+        _CheckGlobalFunctionReturnAnnotation(cls.__args__[1])
+    elif oft.OriginFrom(cls, oft.PyStructCompatibleToBlob):
+        pass
+    else:
+        raise NotImplementedError("invalid return annotation %s found" % cls)
+
+
+def CheckReturnByAnnotation(function_name, ret, annotation):
+    if annotation is inspect._empty:
+        return
+    if annotation is None:
+        error_str = (
+            "%s does not matched return annotation %s of global_function %s."
+            % (ret, annotation, function_name)
+        )
+        assert ret is None, error_str
+    elif oft.OriginFrom(annotation, oft.Callback):
+        _CheckReturnByAnnotation(function_name, ret, annotation.__args__[0])
+    elif oft.OriginFrom(annotation, oft.Bundle):
+        if isinstance(ret, oneflow._oneflow_internal.BlobDesc):
+            _CheckReturnByAnnotation(function_name, ret, annotation.__args__[0])
+        elif isinstance(ret, (list, tuple)):
+            for elem in ret:
+                CheckReturnByAnnotation(function_name, elem, annotation)
+        elif type(ret) is dict:
+            for val in ret.values():
+                CheckReturnByAnnotation(function_name, val, annotation)
+        else:
+            raise NotImplementedError("invalid return  %s found" % type(ret))
+    else:
+        _CheckReturnByAnnotation(function_name, ret, annotation)
+
+
+def _CheckReturnByAnnotation(function_name, ret, annotation):
+    error_str = "%s does not matched return annotation %s of global_function %s." % (
+        ret,
+        annotation,
+        function_name,
+    )
+    if oft.OriginFrom(annotation, typing.Tuple):
+        assert type(ret) is tuple, error_str
+        assert len(ret) == len(annotation.__args__), "%s length compare: %s v.s. %s" % (
+            error_str,
+            len(ret),
+            len(annotation.__args__),
+        )
+        for (ret_i, annotation_i) in zip(ret, annotation.__args__):
+            _CheckReturnByAnnotation(function_name, ret_i, annotation_i)
+    elif oft.OriginFrom(annotation, typing.List):
+        assert type(ret) is list, error_str
+        assert len(annotation.__args__) == 1, (
+            "%s element type in list must be unique" % error_str
+        )
+        for ret_i in ret:
+            _CheckReturnByAnnotation(function_name, ret_i, annotation.__args__[0])
+    elif oft.OriginFrom(annotation, typing.Dict):
+        assert len(annotation.__args__) == 2
+        assert type(ret) is dict, error_str
+        for (key, val) in ret.items():
+            assert type(key) is annotation.__args__[0], (
+                "type of %s:%s and %s:%s do not matched return annotation (%s, %s) of global_function %s."
+                % (
+                    key,
+                    type(key),
+                    val,
+                    type(val),
+                    annotation.__args__[0],
+                    annotation.__args__[1],
+                    function_name,
+                )
+            )
+            _CheckReturnByAnnotation(function_name, val, annotation.__args__[1])
+    elif oft.OriginFrom(annotation, oft.Numpy):
+        assert isinstance(
+            ret, oneflow._oneflow_internal.BlobDesc
+        ), "type(ret): %s" % type(ret)
+        assert (
+            not ret.is_dynamic
+        ), "only fixed shaped blob compatible to oneflow.typing.Numpy. you can change annotation to oneflow.typing.ListNumpy "
+    elif oft.OriginFrom(annotation, oft.ListNumpy):
+        assert isinstance(
+            ret, oneflow._oneflow_internal.BlobDesc
+        ), "type(ret): %s" % type(ret)
+    else:
+        raise NotImplementedError("invalid return annotation %s found" % annotation)
+
+
+def TransformGlobalFunctionResult(future_blob, annotation):
+    if annotation is inspect._empty:
+        return future_blob
+    elif annotation is None:
+        assert future_blob is None
+        return None
+    elif oft.OriginFrom(annotation, oft.Callback):
+        annotation = annotation.__args__[0]
+
+        def Transform(f):
+            return lambda x: f(TransformReturnedLocalBlob(x, annotation))
+
+        return lambda f: future_blob.async_get(Transform(f))
+    elif oft.OriginFrom(annotation, oft.Bundle):
+        return TransformReturnedBundle(future_blob.get(), annotation)
+    else:
+        return TransformReturnedLocalBlob(future_blob.get(), annotation)
+
+
+def TransformReturnedBundle(bundle_blob, annotation):
+    """
+    Transform returned bundle blob from global_function(job_func),
+    the returned bundle blob could be the form like x, [x], (x, ),
+    {"key": x} or the mixed form of them.
+    """
+    if isinstance(bundle_blob, (local_blob_util.LocalBlob,)):
+        return TransformReturnedLocalBlob(bundle_blob, annotation.__args__[0])
+    elif isinstance(bundle_blob, (list, tuple)):
+        return type(bundle_blob)(
+            (TransformReturnedBundle(elem, annotation) for elem in bundle_blob)
+        )
+    elif type(bundle_blob) is dict:
+        return {
+            key: TransformReturnedBundle(val, annotation)
+            for (key, val) in bundle_blob.items()
+        }
+    else:
+        raise NotImplementedError(
+            "invalid return  %s : %s found" % (bundle_blob, type(bundle_blob))
+        )
+
+
+def TransformReturnedLocalBlob(local_blob, annotation):
+    if oft.OriginFrom(annotation, typing.Tuple):
+        assert type(local_blob) is tuple
+        assert len(local_blob) == len(annotation.__args__)
+        pairs = zip(local_blob, annotation.__args__)
+        return tuple((TransformReturnedLocalBlob(*pair) for pair in pairs))
+    elif oft.OriginFrom(annotation, typing.List):
+        assert type(local_blob) is list
+        assert len(annotation.__args__) == 1
+        return [
+            TransformReturnedLocalBlob(elem, annotation.__args__[0])
+            for elem in local_blob
+        ]
+    elif oft.OriginFrom(annotation, typing.Dict):
+        assert type(local_blob) is dict
+        assert len(annotation.__args__) == 2
+        vals = [
+            TransformReturnedLocalBlob(val, annotation.__args__[1])
+            for val in local_blob.values()
+        ]
+        return dict(zip(local_blob.keys(), vals))
+    elif oft.OriginFrom(annotation, oft.PyStructCompatibleToBlob):
+        return TransformLocalBlob(local_blob, annotation)
+    else:
+        raise NotImplementedError(
+            "invalid watch callback parameter annotation %s found" % annotation
+        )
+
+
+def CheckWatchCallbackParameterAnnotation(parameters):
+    assert len(parameters) == 1, "watch callback should accept only one parameter"
+    annotation = parameters[list(parameters.keys())[0]].annotation
+    if annotation is inspect._empty:
+        if enable_typing_check.typing_check_enabled:
+            raise NotImplementedError("the watch callback's parameter is not annotated")
+        return
+    if not oft.OriginFrom(annotation, oft.PyStructCompatibleToBlob):
+        raise NotImplementedError(
+            "invalid watch callback paremeter annotation %s found. " % annotation
+            + "candidate annotations: oneflow.typing.Numpy, oneflow.typing.ListNumpy. "
+        )
+
+
+def CheckWatchedBlobByAnnotation(blob, annotation):
+    if annotation is inspect._empty:
+        return
+    if oft.OriginFrom(annotation, oft.Numpy):
+        assert (
+            not blob.is_dynamic
+        ), "only fixed shaped blob compatible to oneflow.typing.Numpy. you can change annotation to oneflow.typing.ListNumpy "
+    elif oft.OriginFrom(annotation, oft.ListNumpy):
+        pass
+    else:
+        raise NotImplementedError(
+            "invalid watch callback parameter annotation %s found" % annotation
+        )
+
+
+def TransformWatchedBlob(future_blob, handler):
+    parameters = inspect.signature(handler).parameters
+    annotation = parameters[list(parameters.keys())[0]].annotation
+    if annotation is inspect._empty:
+        return future_blob
+    return TransformLocalBlob(future_blob, annotation)
+
+
+def TransformLocalBlob(future_blob, annotation):
+    if oft.OriginFrom(annotation, oft.Numpy):
+        return future_blob.numpy()
+    elif oft.OriginFrom(annotation, oft.ListNumpy):
+        return future_blob.numpy_list()
+    else:
+        raise NotImplementedError(
+            "invalid watch callback parameter annotation %s found" % annotation
+        )
diff --git a/python/oneflow/framework/unittest.py b/python/oneflow/framework/unittest.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eb2542398301eb10897e1db9b779e70a7aa4cf6
--- /dev/null
+++ b/python/oneflow/framework/unittest.py
@@ -0,0 +1,372 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import atexit
+import imp
+import os
+import socket
+import subprocess
+import sys
+import unittest
+import uuid
+from contextlib import closing
+from tempfile import NamedTemporaryFile
+from typing import Any, Callable, Dict
+
+import google.protobuf.text_format as pbtxt
+
+import oneflow
+import oneflow.env
+import oneflow.framework.env_util as env_util
+from oneflow.core.job.env_pb2 import EnvProto
+
+
+class _ClearDefaultSession(object):
+    def setUp(self):
+        oneflow.clear_default_session()
+        oneflow.enable_eager_execution(False)
+
+
+def register_test_cases(
+    scope: Dict[str, Any],
+    directory: str,
+    filter_by_num_nodes: Callable[[bool], int],
+    base_class: unittest.TestCase = unittest.TestCase,
+    test_case_mixin=_ClearDefaultSession,
+) -> None:
+    def FilterTestPyFile(f):
+        return (
+            os.path.isfile(os.path.join(directory, f))
+            and f.endswith(".py")
+            and f.startswith("test")
+        )
+
+    def FilterMethodName(module, name):
+        method = getattr(module, name)
+        return (
+            name.startswith("test")
+            and callable(method)
+            and filter_by_num_nodes(_GetNumOfNodes(method))
+        )
+
+    onlytest_files = [f for f in os.listdir(directory) if FilterTestPyFile(f)]
+    for f in onlytest_files:
+        class_name = f[0:-3]
+        module = imp.load_source(class_name, os.path.join(directory, f))
+        test_func_names = [
+            name for name in dir(module) if FilterMethodName(module, name)
+        ]
+        method_dict = {k: getattr(module, k) for k in test_func_names}
+        scope[class_name] = type(class_name, (test_case_mixin, base_class), method_dict)
+
+
+def num_nodes_required(num_nodes: int) -> Callable[[Callable], Callable]:
+    def Decorator(f):
+        f.__oneflow_test_case_num_nodes_required__ = num_nodes
+        return f
+
+    return Decorator
+
+
+def _GetNumOfNodes(func):
+    if hasattr(func, "__oneflow_test_case_num_nodes_required__") == False:
+        return 1
+    return getattr(func, "__oneflow_test_case_num_nodes_required__")
+
+
+def eager_execution_enabled():
+    return os.getenv("ONEFLOW_TEST_ENABLE_EAGER") == "1"
+
+
+def typing_check_enabled():
+    return os.getenv("ONEFLOW_TEST_ENABLE_TYPING_CHECK") == "1"
+
+
+def node_list():
+    node_list_str = os.getenv("ONEFLOW_TEST_NODE_LIST")
+    assert node_list_str
+    return node_list_str.split(",")
+
+
+def has_node_list():
+    if os.getenv("ONEFLOW_TEST_NODE_LIST"):
+        return True
+    else:
+        return False
+
+
+def node_size():
+    if has_node_list():
+        node_list_from_env = node_list()
+        return len(node_list_from_env)
+    else:
+        return 1
+
+
+def has_world_size():
+    if oneflow.distributed.is_multi_client():
+        return True
+    if os.getenv("ONEFLOW_TEST_WORLD_SIZE"):
+        assert os.getenv(
+            "ONEFLOW_TEST_WORLD_SIZE"
+        ).isdigit(), "env var ONEFLOW_TEST_WORLD_SIZE must be num"
+        return True
+    else:
+        return False
+
+
+def world_size():
+    if oneflow.distributed.is_multi_client():
+        return oneflow.distributed.get_world_size()
+    return int(os.getenv("ONEFLOW_TEST_WORLD_SIZE"))
+
+
+def device_num():
+    device_num_str = os.getenv("ONEFLOW_TEST_DEVICE_NUM")
+    if device_num_str:
+        return int(device_num_str)
+    else:
+        return 1
+
+
+def enable_init_by_host_list():
+    return os.getenv("ONEFLOW_TEST_ENABLE_INIT_BY_HOST_LIST") == "1"
+
+
+def enable_multi_process():
+    return os.getenv("ONEFLOW_TEST_MULTI_PROCESS") == "1"
+
+
+def find_free_port():
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(("localhost", 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
+
+
+_unittest_env_initilized = False
+_unittest_worker_initilized = False
+
+
+def worker_agent_port():
+    port_txt = os.getenv("ONEFLOW_TEST_WORKER_AGENT_PORT")
+    if port_txt:
+        return int(port_txt)
+    else:
+        return None
+
+
+def worker_agent_authkey():
+    key = os.getenv("ONEFLOW_TEST_WORKER_AGENT_AUTHKEY")
+    assert key
+    return key
+
+
+def use_worker_agent():
+    return worker_agent_port() is not None
+
+
+def cast(conn=None, cmd=None, msg=None):
+    cmd = "cast/" + cmd
+    print("[unittest]", f"[{cmd}]", msg)
+    conn.send(cmd.encode())
+    conn.send(msg.encode())
+
+
+def call(conn=None, cmd=None, msg=None):
+    cmd = "call/" + cmd
+    print("[unittest]", f"[{cmd}]", msg)
+    conn.send(cmd.encode())
+    msg_ = ""
+    if msg is not None:
+        msg_ = msg
+    conn.send(msg_.encode())
+    return conn.recv().decode()
+
+
+def launch_worker_via_agent(host=None, env_proto=None):
+    print("[unittest]", "launching worker via agent at", host)
+    from multiprocessing.connection import Client
+
+    address = ("localhost", worker_agent_port())
+    conn = Client(address, authkey=worker_agent_authkey().encode())
+    cast(conn=conn, cmd="host", msg=host)
+    cast(conn=conn, cmd="env_proto", msg=pbtxt.MessageToString(env_proto))
+    assert call(conn=conn, cmd="start_worker") == "ok"
+    print("[unittest]", "worker launched via agent at", host)
+    conn.close()
+
+
+class TestCase(unittest.TestCase):
+    def setUp(self):
+        global _unittest_env_initilized
+        global _unittest_worker_initilized
+        if has_node_list():
+            assert node_size() > 1
+            if _unittest_worker_initilized == False:
+                master_port = os.getenv("ONEFLOW_TEST_MASTER_PORT")
+                assert master_port, "env var ONEFLOW_TEST_MASTER_PORT not set"
+                oneflow.env.ctrl_port(int(master_port))
+                data_port = os.getenv("ONEFLOW_TEST_DATA_PORT")
+                if data_port:
+                    oneflow.env.data_port(int(data_port))
+                if enable_init_by_host_list():
+                    oneflow.env.machine(node_list())
+                    data_port = os.getenv("ONEFLOW_TEST_DATA_PORT")
+                    print("initializing worker...")
+                    for machine in env_util.default_env_proto.machine:
+                        if machine.id == 0:
+                            pass
+                        else:
+                            launch_worker_via_agent(
+                                host=machine.addr, env_proto=env_util.default_env_proto
+                            )
+                else:
+                    ctrl_port = os.getenv("ONEFLOW_TEST_CTRL_PORT")
+                    config_rank_ctrl_port = -1
+                    if ctrl_port:
+                        config_rank_ctrl_port = int(ctrl_port)
+                    if has_world_size():
+                        config_world_size = world_size()
+                    else:
+                        config_world_size = 0
+                    config_node_size = -1
+                    env_node_size = os.getenv("ONEFLOW_TEST_NODE_SIZE")
+                    if env_node_size:
+                        config_node_size = int(env_node_size)
+                    bootstrap_conf_list = oneflow.env.init_bootstrap_confs(
+                        node_list(),
+                        int(master_port),
+                        config_world_size,
+                        config_rank_ctrl_port,
+                        config_node_size,
+                    )
+                    worker_env_proto = EnvProto()
+                    worker_env_proto.CopyFrom(env_util.default_env_proto)
+                    worker_env_proto.ClearField("ctrl_bootstrap_conf")
+                    for bootstrap_conf in bootstrap_conf_list:
+                        if bootstrap_conf.rank == 0:
+                            continue
+                        assert bootstrap_conf.HasField("host")
+                        worker_env_proto.ctrl_bootstrap_conf.CopyFrom(bootstrap_conf)
+                        launch_worker_via_agent(
+                            host=bootstrap_conf.host, env_proto=worker_env_proto
+                        )
+                _unittest_worker_initilized = True
+        elif device_num() > 1 and enable_multi_process():
+            master_port = find_free_port()
+            oneflow.env.ctrl_port(master_port)
+            config_world_size = device_num()
+            bootstrap_conf_list = oneflow.env.init_bootstrap_confs(
+                ["127.0.0.1"], master_port, config_world_size
+            )
+            env_proto = env_util.default_env_proto
+            assert (
+                len(env_proto.machine) == 1
+                and env_proto.HasField("ctrl_bootstrap_conf") == 1
+            )
+            run_dir = os.getenv("HOME") + "/oneflow_temp/" + str(uuid.uuid1())
+            run_dir = os.path.abspath(os.path.expanduser(run_dir))
+            if not os.path.exists(run_dir):
+                os.makedirs(run_dir)
+            for rank in range(1, config_world_size):
+                worker_env_proto = EnvProto()
+                worker_env_proto.CopyFrom(env_proto)
+                worker_env_proto.ctrl_bootstrap_conf.rank = rank
+                worker_env_proto.cpp_logging_conf.log_dir = (
+                    run_dir + "/log_" + str(rank)
+                )
+                env_file = NamedTemporaryFile(delete=False)
+                if sys.version_info >= (3, 0):
+                    env_file.write(pbtxt.MessageToString(worker_env_proto).encode())
+                else:
+                    env_file.write(pbtxt.MessageToString(worker_env_proto))
+                env_file.close()
+                if not os.path.exists(run_dir + "/log_" + str(rank)):
+                    os.mkdir(run_dir + "/log_" + str(rank))
+                os.system(
+                    "cp "
+                    + env_file.name
+                    + " "
+                    + run_dir
+                    + "/log_"
+                    + str(rank)
+                    + "/env_proto_"
+                    + str(rank)
+                    + ".proto"
+                )
+                oneflow_cmd = (
+                    "python3 -m oneflow --start_worker"
+                    + " --env_proto="
+                    + run_dir
+                    + "/log_"
+                    + str(rank)
+                    + "/"
+                    + "env_proto_"
+                    + str(rank)
+                    + ".proto"
+                )
+                subprocess.Popen(
+                    oneflow_cmd,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                    shell=True,
+                )
+                os.remove(env_file.name)
+            atexit.register(
+                oneflow.deprecated.delete_worker_of_multi_process, run_dir=run_dir
+            )
+        log_dir = os.getenv("ONEFLOW_TEST_LOG_DIR")
+        if log_dir:
+            oneflow.env.log_dir(log_dir)
+        if _unittest_env_initilized == False:
+            oneflow.env.init()
+            _unittest_env_initilized = True
+        oneflow.clear_default_session()
+        oneflow.enable_eager_execution(eager_execution_enabled())
+        oneflow.experimental.enable_typing_check(typing_check_enabled())
+
+
+def skip_unless(n, d):
+    if node_size() == n and device_num() == d:
+        return lambda func: func
+    else:
+        return unittest.skip(
+            "only runs when node_size is {} and device_num is {}".format(n, d)
+        )
+
+
+def skip_unless_1n1d():
+    return skip_unless(1, 1)
+
+
+def skip_unless_1n2d():
+    return skip_unless(1, 2)
+
+
+def skip_unless_1n4d():
+    return skip_unless(1, 4)
+
+
+def skip_unless_2n1d():
+    return skip_unless(2, 1)
+
+
+def skip_unless_2n2d():
+    return skip_unless(2, 2)
+
+
+def skip_unless_2n4d():
+    return skip_unless(2, 4)
diff --git a/python/oneflow/framework/variable_getter_composite.py b/python/oneflow/framework/variable_getter_composite.py
new file mode 100644
index 0000000000000000000000000000000000000000..86e76cfd07d04af13e424a803ca2cdb270b61fe4
--- /dev/null
+++ b/python/oneflow/framework/variable_getter_composite.py
@@ -0,0 +1,37 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+
+
+class VariableGetterComposite(object):
+    def __init__(self):
+        self.getter_stack = []
+
+    def __call__(self, var_gen_fn, *args, **kwargs):
+        def make_inner(outter, inner):
+            @functools.wraps(inner)
+            def inner_fn():
+                return outter(inner, *args, **kwargs)
+
+            return inner_fn
+
+        fn = var_gen_fn
+        for getter in self.getter_stack:
+            fn = make_inner(getter, fn)
+        return fn()
+
+    def register(self, fn):
+        self.getter_stack.append(fn)
diff --git a/python/oneflow/framework/watcher.py b/python/oneflow/framework/watcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b6af5dacbd5e93bbde584312936b06175cbe295
--- /dev/null
+++ b/python/oneflow/framework/watcher.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import traceback
+
+from google.protobuf import text_format
+
+import oneflow._oneflow_internal
+import oneflow.core.record.record_pb2 as record_util
+import oneflow.framework.local_blob as local_blob_util
+import oneflow.framework.ofblob as ofblob
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.framework.typing_util as oft_util
+
+
+def BindUuidAndHandler(uuid, blob_watched, handler):
+    assert isinstance(blob_watched, oneflow._oneflow_internal.ConsistentBlob)
+    session_ctx.GetDefaultSession().uuid2watch_handler[uuid] = (blob_watched, handler)
+
+
+class _Watcher(oneflow._oneflow_internal.ForeignWatcher):
+    def __init__(self):
+        oneflow._oneflow_internal.ForeignWatcher.__init__(self)
+
+    def Call(self, handler_uuid, of_blob_ptr):
+        try:
+            _WatcherHandler(handler_uuid, of_blob_ptr)
+        except Exception as e:
+            print(traceback.format_exc())
+            raise e
+
+
+def _WatcherHandler(handler_uuid, of_blob_ptr):
+    uuid2handler = session_ctx.GetDefaultSession().uuid2watch_handler
+    assert handler_uuid in uuid2handler
+    (blob_watched, handler) = uuid2handler[handler_uuid]
+    assert callable(handler)
+    ndarray = ofblob.OfBlob(of_blob_ptr).CopyToNdarray()
+    local_blob = local_blob_util.LocalBlob(ndarray, blob_watched.is_dynamic)
+    handler(oft_util.TransformWatchedBlob(local_blob, handler))
+
+
+_global_watcher = _Watcher()
diff --git a/python/oneflow/layers.py b/python/oneflow/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4cfdf7997ea1b224733c26304ecd0e45323c309
--- /dev/null
+++ b/python/oneflow/layers.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.categorical_ordinal_encode_op import categorical_ordinal_encoder
+from oneflow.ops.prelu import prelu
diff --git a/python/oneflow/linalg.py b/python/oneflow/linalg.py
new file mode 100644
index 0000000000000000000000000000000000000000..a33ef9c06543c30461006235141f3627c1cd50f7
--- /dev/null
+++ b/python/oneflow/linalg.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.modules.norm import matrix_norm_tensor_op as matrix_norm
+from oneflow.nn.modules.norm import norm_op as norm
+from oneflow.nn.modules.norm import vector_norm_tensor_op as vector_norm
diff --git a/python/oneflow/math.py b/python/oneflow/math.py
new file mode 100644
index 0000000000000000000000000000000000000000..addb8101969687eca907ff32dc97ac0f8ee785fb
--- /dev/null
+++ b/python/oneflow/math.py
@@ -0,0 +1,23 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.math_binary_elementwise_ops import atan2, floordiv, pow, xdivy, xlogy
+from oneflow.ops.reduce_mean import reduce_mean
+from oneflow.ops.two_stage_reduce import (
+    api_two_stage_reduce_max as two_stage_reduce_max,
+)
+from oneflow.ops.two_stage_reduce import (
+    api_two_stage_reduce_min as two_stage_reduce_min,
+)
diff --git a/python/oneflow/model.py b/python/oneflow/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5713c2d791838a536acbb2bc303de79e7d4b7550
--- /dev/null
+++ b/python/oneflow/model.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.model import Callback, CheckpointConfig, DataModule
+from oneflow.framework.model import Model as Model
+from oneflow.framework.model import NumpyDataModule, TrainingConfig, ValidationConfig
diff --git a/python/oneflow/nn/__init__.py b/python/oneflow/nn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..890810e61cbcb4375a3ecfeb5ced73616baa5b26
--- /dev/null
+++ b/python/oneflow/nn/__init__.py
@@ -0,0 +1,110 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.graph import Graph
+from oneflow.nn.module import Module
+from oneflow.nn.modules.activation import (
+    ELU,
+    GELU,
+    Hardsigmoid,
+    Hardswish,
+    Hardtanh,
+    LeakyReLU,
+    LogSigmoid,
+    LogSoftmax,
+    Mish,
+    PReLU,
+    ReLU,
+    ReLU6,
+    Sigmoid,
+    Softmax,
+    Softplus,
+    Tanh,
+)
+from oneflow.nn.modules.adaptive_pool import (
+    AdaptiveAvgPool1d,
+    AdaptiveAvgPool2d,
+    AdaptiveAvgPool3d,
+)
+from oneflow.nn.modules.batchnorm import BatchNorm1d, BatchNorm2d
+from oneflow.nn.modules.container import (
+    ModuleDict,
+    ModuleList,
+    ParameterDict,
+    ParameterList,
+    Sequential,
+)
+from oneflow.nn.modules.conv import Conv1d, Conv2d
+from oneflow.nn.modules.dataset import (
+    COCOReader,
+    CoinFlip,
+    CropMirrorNormalize,
+    OFRecordImageDecoder,
+    OFRecordImageDecoderRandomCrop,
+    OfrecordRawDecoder,
+    OfrecordReader,
+)
+from oneflow.nn.modules.deconv import ConvTranspose2d
+from oneflow.nn.modules.dropout import Dropout
+from oneflow.nn.modules.flatten import Flatten
+from oneflow.nn.modules.instancenorm import (
+    InstanceNorm1d,
+    InstanceNorm2d,
+    InstanceNorm3d,
+)
+from oneflow.nn.modules.linear import Identity, Linear
+from oneflow.nn.modules.loss import (
+    BCELoss,
+    BCEWithLogitsLoss,
+    CrossEntropyLoss,
+    CTCLoss,
+    KLDivLoss,
+    L1Loss,
+    MarginRankingLoss,
+    MSELoss,
+    NLLLoss,
+    SmoothL1Loss,
+)
+from oneflow.nn.modules.normalization import GroupNorm, LayerNorm
+from oneflow.nn.modules.padding import (
+    ConstantPad1d,
+    ConstantPad2d,
+    ConstantPad3d,
+    ReflectionPad2d,
+    ReplicationPad2d,
+)
+from oneflow.nn.modules.pixelshuffle import PixelShufflev2 as PixelShuffle
+from oneflow.nn.modules.pooling import (
+    AvgPool1d,
+    AvgPool2d,
+    AvgPool3d,
+    MaxPool1d,
+    MaxPool2d,
+    MaxPool3d,
+)
+from oneflow.nn.modules.sparse import Embedding
+from oneflow.nn.modules.upsampling import (
+    Upsample,
+    UpsamplingBilinear2d,
+    UpsamplingNearest2d,
+)
+from oneflow.nn.modules.zeropad2d import ZeroPad2d
+from oneflow.nn.parameter import Parameter
+from oneflow.ops.domain_ops import (
+    api_fused_self_attention_query_mul_key_and_value as fused_self_attention_query_mul_key_and_value,
+)
+from oneflow.ops.loss_ops import ctc_greedy_decoder
+
+from . import functional
diff --git a/python/oneflow/nn/common_types.py b/python/oneflow/nn/common_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..d91ca3eda286f89df41573a5ffc4a0007afdbf8d
--- /dev/null
+++ b/python/oneflow/nn/common_types.py
@@ -0,0 +1,35 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Tuple, TypeVar, Union
+
+T = TypeVar("T")
+_scalar_or_tuple_any_t = Union[T, Tuple[T, ...]]
+_scalar_or_tuple_1_t = Union[T, Tuple[T]]
+_scalar_or_tuple_2_t = Union[T, Tuple[T, T]]
+_scalar_or_tuple_3_t = Union[T, Tuple[T, T, T]]
+_scalar_or_tuple_4_t = Union[T, Tuple[T, T, T, T]]
+_scalar_or_tuple_5_t = Union[T, Tuple[T, T, T, T, T]]
+_scalar_or_tuple_6_t = Union[T, Tuple[T, T, T, T, T, T]]
+_size_any_t = _scalar_or_tuple_any_t[int]
+_size_1_t = _scalar_or_tuple_1_t[int]
+_size_2_t = _scalar_or_tuple_2_t[int]
+_size_3_t = _scalar_or_tuple_3_t[int]
+_size_4_t = _scalar_or_tuple_4_t[int]
+_size_5_t = _scalar_or_tuple_5_t[int]
+_size_6_t = _scalar_or_tuple_6_t[int]
+_ratio_2_t = _scalar_or_tuple_2_t[float]
+_ratio_3_t = _scalar_or_tuple_3_t[float]
+_ratio_any_t = _scalar_or_tuple_any_t[float]
diff --git a/python/oneflow/nn/functional.py b/python/oneflow/nn/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cae9c66bbbaeee005a7fe836c629e88fcd06c97
--- /dev/null
+++ b/python/oneflow/nn/functional.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.modules.interpolate import interpolate
diff --git a/python/oneflow/nn/graph.py b/python/oneflow/nn/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0456ac68f8c4d67896034ec781e00111e72e989
--- /dev/null
+++ b/python/oneflow/nn/graph.py
@@ -0,0 +1,279 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from collections import OrderedDict
+from functools import partial
+from typing import Dict
+
+import oneflow._oneflow_internal
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.graph_build_util as graph_build_util
+import oneflow.framework.session_context as session_ctx
+from oneflow._oneflow_internal import Tensor as InternalTensor
+from oneflow.framework.function_util import FunctionConfig
+from oneflow.framework.multi_client_session import MultiClientSession
+from oneflow.framework.tensor_tuple_util import convert_to_tensor_tuple
+from oneflow.nn.graph_block import Block, BlockType
+from oneflow.nn.graph_optimizer import OptimizerConfig
+from oneflow.nn.module import Module
+from oneflow.nn.optimizer.optimizer import Optimizer
+from oneflow.nn.utils import add_indent
+
+
+class Graph(object):
+    _child_init_cnt = dict()
+
+    def __init__(self):
+        self.config = GraphConfig()
+        self._generate_name()
+        self.config.proto.set_job_name(self._name)
+        self._c_nn_graph = oneflow._oneflow_internal.nn.graph.CNNGraph(self._name)
+        self._blocks = OrderedDict()
+        self._optimizers = OrderedDict()
+        self._is_compiled = False
+        self._var2var_op_name = dict()
+        self._job_proto = None
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def training(self):
+        return self.config.training
+
+    @property
+    def _graph_proto(self):
+        return self._job_proto
+
+    def build(self, *args):
+        raise NotImplementedError()
+
+    def add_optimizer(
+        self,
+        name: str,
+        optimizer: Optimizer = None,
+        lr_scheduler=None,
+        grad_clipping_conf=None,
+        weight_decay_conf=None,
+    ):
+        assert name is not None, "name cannot be None"
+        assert type(name) is str, "name must be an instance of str"
+        assert optimizer is not None, "optimizer cannot be None"
+        assert isinstance(
+            optimizer, Optimizer
+        ), "optimizer must be an instance of Optimizer"
+        self._optimizers[name] = OptimizerConfig(
+            name, optimizer, lr_scheduler, grad_clipping_conf, weight_decay_conf
+        )
+
+    def _generate_name(self):
+        child_name = self.__class__.__name__
+        if Graph._child_init_cnt.get(child_name) is None:
+            Graph._child_init_cnt[child_name] = 0
+        self._name = child_name + "_" + str(Graph._child_init_cnt[child_name])
+        Graph._child_init_cnt[child_name] += 1
+
+    def _state(self):
+        for (_, b) in self._blocks.items():
+            pa_gen = b.parameters(recurse=True)
+            for pa in pa_gen:
+                yield pa
+            bu_gen = b.buffers(recurse=True)
+            for bu in bu_gen:
+                yield bu
+
+    def _preprocess_state(self):
+        state_list = list()
+        for state_block in self._state():
+            state_list.append(state_block.origin)
+            if state_block.type == BlockType.PARAMETER:
+                self._var2var_op_name[state_block.origin] = (
+                    state_block.name_prefix + state_block.name
+                )
+
+    def _complete_graph_config(self):
+        if len(self._optimizers):
+            self.config._train(True)
+        for (name, opt_config) in self._optimizers.items():
+            self.config.add_optimizer_config(opt_config, self._var2var_op_name)
+
+    def _compile(self, *args):
+        assert not self._is_compiled, (
+            "nn.Graph " + self._name + " has already been compiled."
+        )
+        self._preprocess_state()
+        self._complete_graph_config()
+        session = session_ctx.GetDefaultSession()
+        assert type(session) is MultiClientSession
+        session.TryInit()
+        with graph_build_util.graph_build_context(self.config.proto, session):
+            lazy_args = []
+            lazy_arg_op_names = []
+            for (idx, arg) in enumerate(args):
+                op_name = "_" + self.name + "-input_" + str(idx)
+                lazy_args.append(graph_build_util.build_graph_input_arg(op_name, arg))
+                lazy_arg_op_names.append(op_name)
+            state_op_names = []
+            state_tensors = []
+            for state_block in self._state():
+                op_name = state_block.name_prefix + state_block.name
+                state_tensor = state_block.origin
+                state_op_names.append(op_name)
+                state_tensors.append(state_tensor)
+                state_block.set_lazy_origin_builder(
+                    partial(graph_build_util.build_graph_state, op_name, state_tensor)
+                )
+            self._variables = convert_to_tensor_tuple(state_tensors)
+            outputs = self.build(*lazy_args)
+            if not (type(outputs) is tuple or type(outputs) is list):
+                if outputs is None:
+                    outputs = ()
+                else:
+                    assert type(outputs) is InternalTensor
+                    outputs = (outputs,)
+            eager_outputs = []
+            eager_output_op_names = []
+            for (idx, out) in enumerate(outputs):
+                op_name = "_" + self.name + "-output_" + str(idx)
+                eager_outputs.append(graph_build_util.build_graph_output(op_name, out))
+                eager_output_op_names.append(op_name)
+            if len(eager_outputs) == 0:
+                eager_outputs = None
+            elif len(eager_outputs) == 1:
+                eager_outputs = eager_outputs[0]
+            else:
+                eager_outputs = tuple(eager_outputs)
+            self._outputs = convert_to_tensor_tuple(eager_outputs)
+            self._eager_outputs = eager_outputs
+            self._c_nn_graph.register_input_op_names(lazy_arg_op_names)
+            self._c_nn_graph.register_output_op_names(eager_output_op_names)
+            self._c_nn_graph.register_variable_op_names_and_tensors(
+                state_op_names, self._variables
+            )
+            self._job_proto = c_api_util.GetCurrentJob()
+        self._c_nn_graph.complie_and_init_runtime()
+        self._is_compiled = True
+        return eager_outputs
+
+    def _launch(self, *args):
+        oneflow._oneflow_internal.nn.graph.RunLazyNNGraph(
+            convert_to_tensor_tuple(args),
+            self._outputs,
+            self._variables,
+            self._c_nn_graph,
+        )
+        return self._eager_outputs
+
+    def __call__(self, *args):
+        if not self._is_compiled:
+            self._compile(*args)
+        return self._launch(*args)
+
+    def _add_block(self, name: str, module: Module = None) -> None:
+        """Adds a module to the current graph as a block.
+
+        The block can be accessed as an attribute using the given name.
+
+        Args:
+            name (string): name of the child block. The child block can be
+                accessed from this graph using the given name
+            module (Module): child module to be added to the graph.
+        """
+        if not isinstance(module, Module) and module is not None:
+            raise TypeError("{} is not a Module subclass".format(type(module)))
+        elif not isinstance(name, str):
+            raise TypeError("module name should be a string. Got {}".format(type(name)))
+        elif hasattr(self, name) and name not in self._blocks:
+            raise KeyError("attribute '{}' already exists".format(name))
+        elif "." in name:
+            raise KeyError('module name can\'t contain ".", got: {}'.format(name))
+        elif name == "":
+            raise KeyError('module name can\'t be empty string ""')
+        self._blocks[name] = Block("", name, module)
+
+    def __setattr__(self, name: str, value=None):
+        if isinstance(value, Module):
+            self._add_block(name, value)
+        elif isinstance(value, Optimizer):
+            raise AttributeError(
+                "'{}' object are not allowed to set Optimizer attribute named '{}', please use add_optimizer(...) instead.".format(
+                    type(self).__name__, name
+                )
+            )
+        else:
+            object.__setattr__(self, name, value)
+
+    def __getattr__(self, name: str):
+        if "_blocks" in self.__dict__:
+            if name in self._blocks:
+                return self._blocks[name]
+        if name in self.__dict__:
+            return self.__dict__[name]
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, name)
+        )
+
+    def __repr__(self):
+        lines = None
+        if len(self._blocks) > 0:
+            child_lines = []
+            for (n, m) in self._blocks.items():
+                mod_str = repr(m)
+                mod_str = add_indent(mod_str, 2)
+                child_lines.append(mod_str)
+            lines = child_lines
+        main_str = "(" + self._name + ":" + self.__class__.__name__ + ":GRAPH): ("
+        if lines is not None:
+            main_str += "\n  " + "\n  ".join(lines) + "\n"
+        main_str += ")"
+        return main_str
+
+
+class GraphConfig(FunctionConfig):
+    def __init__(self):
+        super().__init__()
+        self._train(False)
+
+    @property
+    def proto(self):
+        return self.function_desc.job_config_proto
+
+    @property
+    def training(self):
+        if self.proto.has_train_conf():
+            return True
+        if self.proto.has_predict_conf():
+            return False
+        raise NotImplementedError
+
+    def _train(self, mode: bool = True):
+        if mode:
+            self.proto.mutable_train_conf()
+            self.proto.mutable_train_conf().set_loss_scale_factor(1.0)
+        else:
+            self.proto.mutable_predict_conf()
+
+    def add_optimizer_config(
+        self, optimizer_config: OptimizerConfig = None, var2var_op_name: Dict = None
+    ):
+        optimizer_config.optimizer.add_to_graph_train_config(
+            self.proto.mutable_train_conf(), var2var_op_name
+        )
+
+
+from oneflow.nn.graph import Graph as Graph
+from oneflow.nn.graph_block import Block, BlockConfig
+from oneflow.nn.graph_optimizer import OptimizerConfig
diff --git a/python/oneflow/nn/graph_block.py b/python/oneflow/nn/graph_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..35634cd09f2c05c27b44f105f2f6978f72d68eaf
--- /dev/null
+++ b/python/oneflow/nn/graph_block.py
@@ -0,0 +1,307 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from collections import OrderedDict
+from typing import Iterator, Optional, Set, Union
+
+import oneflow._oneflow_internal
+import oneflow.framework.graph_build_util as graph_build_util
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.module import Module
+from oneflow.nn.parameter import Parameter
+from oneflow.nn.utils import add_indent
+
+
+class BlockType:
+    NONE = "NONE"
+    MODULE = "MODULE"
+    PARAMETER = "PARAMETER"
+    BUFFER = "BUFFER"
+
+
+class Block(object):
+    def __init__(
+        self,
+        prefix: str = "",
+        name: str = "",
+        value: Union[Module, Parameter, Tensor] = None,
+    ):
+        assert not isinstance(value, Block)
+        self._name = name
+        self._name_prefix = prefix
+        self._type = BlockType.NONE
+        self._origin = value
+        self.config = BlockConfig()
+        self._scope = None
+        self._prev_scope = None
+        if isinstance(value, Module):
+            self._type = BlockType.MODULE
+            self._is_executing_forward = False
+            self._modules = OrderedDict()
+            self._parameters = OrderedDict()
+            self._buffers = OrderedDict()
+            for (n, m) in list(value.named_children()):
+                self.__setattr__(n, Block(self._name_prefix + self._name + ".", n, m))
+            for (n, p) in list(value.named_parameters("", False)):
+                self.__setattr__(n, Block(self._name_prefix + self._name + ".", n, p))
+            for (n, b) in list(value.named_buffers("", False)):
+                self.__setattr__(n, Block(self._name_prefix + self._name + ".", n, b))
+        elif isinstance(value, Parameter):
+            self._type = BlockType.PARAMETER
+            self._lazy_origin = None
+            self._lazy_origin_builder = None
+        elif isinstance(value, Tensor):
+            self._type = BlockType.BUFFER
+            self._lazy_origin = None
+            self._lazy_origin_builder = None
+        else:
+            raise NotImplementedError()
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def name_prefix(self):
+        return self._name_prefix
+
+    @property
+    def type(self):
+        return self._type
+
+    @property
+    def origin(self):
+        return self._origin
+
+    @property
+    def lazy_origin(self):
+        assert (
+            self._type == BlockType.PARAMETER or self._type == BlockType.BUFFER
+        ), "Only Parameter or Buffer Block has lazy_origin"
+        return self._lazy_origin
+
+    def lazy_origin_builder(self):
+        assert (
+            self._type == BlockType.PARAMETER or self._type == BlockType.BUFFER
+        ), "Only Parameter or Buffer Block has lazy_origin_builder"
+        return self._lazy_origin_builder
+
+    def set_lazy_origin_builder(self, fn=None):
+        assert (
+            self._type == BlockType.PARAMETER or self._type == BlockType.BUFFER
+        ), "Only Parameter or Buffer Block has lazy_origin_builder"
+        self._lazy_origin_builder = fn
+
+    @property
+    def prev_scope(self):
+        if self._prev_scope is None:
+            self._prev_scope = oneflow._oneflow_internal.GetCurrentScope()
+        return self._prev_scope
+
+    @property
+    def scope(self):
+        if self._scope is None:
+            self._scope = graph_build_util.make_new_block_scope(self.prev_scope, self)
+        return self._scope
+
+    def scope_context(self):
+        return graph_build_util.BlockScopeContext(self.prev_scope, self.scope)
+
+    def __call__(self, *args):
+        assert self._type == BlockType.MODULE
+        result = self._origin.__class__.__call__(self, *args)
+        return result
+
+    def forward(self, *args):
+        assert self._type == BlockType.MODULE
+        self._is_executing_forward = True
+        with self.scope_context():
+            result = self._origin.__class__.forward(self, *args)
+        self._is_executing_forward = False
+        return result
+
+    def modules(self, memo: Optional[Set["Block"]] = None) -> Iterator["Block"]:
+        assert self._type == BlockType.MODULE
+        if memo is None:
+            memo = set()
+        if self not in memo:
+            memo.add(self)
+            yield self
+            for (name, module) in self._modules.items():
+                if module is None:
+                    continue
+                for m in module.modules(memo):
+                    yield m
+
+    def _members(self, get_members_fn, recurse=True) -> Iterator["Block"]:
+        assert self._type == BlockType.MODULE
+        memo = set()
+        modules = self.modules() if recurse else [self]
+        for module in modules:
+            members = get_members_fn(module)
+            for (k, v) in members:
+                if v is None or v in memo:
+                    continue
+                memo.add(v)
+                yield v
+
+    def parameters(self, recurse: bool = True) -> Iterator["Block"]:
+        assert self._type == BlockType.MODULE
+        gen = self._members(lambda module: module._parameters.items(), recurse=recurse)
+        for elem in gen:
+            yield elem
+
+    def buffers(self, recurse: bool = True) -> Iterator["Block"]:
+        assert self._type == BlockType.MODULE
+        gen = self._members(lambda module: module._buffers.items(), recurse=recurse)
+        for elem in gen:
+            yield elem
+
+    def __setattr__(self, name: str, value=None) -> None:
+        if value is None or not isinstance(value, Block):
+            self.__dict__[name] = value
+        else:
+            dicts_or_sets = (
+                self.__dict__,
+                self._modules,
+                self._parameters,
+                self._buffers,
+            )
+            for d in dicts_or_sets:
+                if name in d:
+                    raise AttributeError(
+                        "'{}' object has duplicated attribute named '{}'".format(
+                            self._name, name
+                        )
+                    )
+            if value.type == BlockType.MODULE:
+                self._modules[name] = value
+            elif value.type == BlockType.PARAMETER:
+                self._parameters[name] = value
+            elif value.type == BlockType.BUFFER:
+                self._buffers[name] = value
+            else:
+                raise AttributeError(
+                    "'{}' object are not allowed to set attribute named '{}'".format(
+                        type(self).__name__, name
+                    )
+                )
+
+    def __getattr__(self, name: str):
+        if name in self.__dict__:
+            return self.__dict__[name]
+        if self._type == BlockType.MODULE:
+            if "_modules" in self.__dict__:
+                modules = self.__dict__["_modules"]
+                if name in modules:
+                    return modules[name]
+            if "_parameters" in self.__dict__:
+                _parameters = self.__dict__["_parameters"]
+                if name in _parameters:
+                    p_block = _parameters[name]
+                    if self._is_executing_forward:
+                        if graph_build_util.lazy_mode.is_enabled():
+                            if p_block._lazy_origin is None:
+                                assert p_block._lazy_origin_builder is not None, (
+                                    repr(p_block)
+                                    + " has no lazy Tensor creation function."
+                                )
+                                with p_block.scope_context():
+                                    p_block._lazy_origin = (
+                                        p_block._lazy_origin_builder()
+                                    )
+                            return p_block._lazy_origin
+                        else:
+                            return p_block.origin
+                    else:
+                        return p_block
+            if "_buffers" in self.__dict__:
+                _buffers = self.__dict__["_buffers"]
+                if name in _buffers:
+                    b_block = _buffers[name]
+                    if self._is_executing_forward:
+                        if graph_build_util.lazy_mode.is_enabled():
+                            if b_block._lazy_origin is None:
+                                assert b_block._lazy_origin_builder is not None, (
+                                    repr(b_block)
+                                    + " has no lazy Tensor creation function."
+                                )
+                                with b_block.scope_context():
+                                    b_block._lazy_origin = (
+                                        b_block._lazy_origin_builder()
+                                    )
+                            return b_block._lazy_origin
+                        else:
+                            return b_block.origin
+                    else:
+                        return b_block
+            if name in self._origin.__dict__:
+                return self._origin.__dict__[name]
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, name)
+        )
+
+    def __repr__(self):
+        lines = None
+        if self._type == BlockType.MODULE:
+            child_lines = []
+
+            def _append_child(d):
+                for (_, n) in d.items():
+                    n_str = repr(n)
+                    n_str = add_indent(n_str, 2)
+                    child_lines.append(n_str)
+
+            _append_child(self._modules)
+            _append_child(self._parameters)
+            _append_child(self._buffers)
+            if len(child_lines) > 0:
+                lines = child_lines
+        main_str = (
+            "("
+            + self._name_prefix
+            + self._name
+            + ":"
+            + self._origin.__class__.__name__
+            + ":"
+            + self._type
+            + "): ("
+        )
+        if lines is not None:
+            main_str += "\n  " + "\n  ".join(lines) + "\n"
+        main_str += ")"
+        return main_str
+
+
+class BlockConfig(object):
+    def __init__(self):
+        self._stage_id = None
+        self._activation_checkpointing = None
+
+    @property
+    def stage_id(self):
+        return self._stage_id
+
+    @stage_id.setter
+    def stage_id(self, value: int = None):
+        self._stage_id = value
+
+    @property
+    def activation_checkpointing(self):
+        return self._activation_checkpointing
+
+    @activation_checkpointing.setter
+    def activation_checkpointing(self, value: bool = False):
+        self._activation_checkpointing = value
diff --git a/python/oneflow/nn/graph_optimizer.py b/python/oneflow/nn/graph_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2ad3c51710a4c44aa76c1e1b386205fbe8f29eb
--- /dev/null
+++ b/python/oneflow/nn/graph_optimizer.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.optimizer.optimizer import Optimizer
+
+
+class OptimizerConfig(object):
+    def __init__(
+        self,
+        name: str,
+        optimizer: Optimizer = None,
+        lr_scheduler=None,
+        grad_clipping_conf=None,
+        weight_decay_conf=None,
+    ):
+        self.name = name
+        self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping_conf = grad_clipping_conf
+        self.weight_decay_conf = weight_decay_conf
diff --git a/python/oneflow/nn/image.py b/python/oneflow/nn/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..5818d3bf5561fe3d0acd3ca414be4dd8dd6bfbb2
--- /dev/null
+++ b/python/oneflow/nn/image.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.modules.dataset import ImageBatchAlign as batch_align
+from oneflow.nn.modules.dataset import ImageDecode as decode
+from oneflow.nn.modules.dataset import ImageFlip as flip
+from oneflow.nn.modules.dataset import ImageNormalize as normalize
+from oneflow.nn.modules.dataset import ImageResize as Resize
diff --git a/python/oneflow/nn/init.py b/python/oneflow/nn/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..2092b462687ee6f52e9375c666df0781d22c0f24
--- /dev/null
+++ b/python/oneflow/nn/init.py
@@ -0,0 +1,77 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.initializer_util import CalcGain
+
+
+def calculate_gain(nonlinearity, param=None):
+    return CalcGain(nonlinearity, param)
+
+
+def uniform_(tensor, a=0.0, b=1.0):
+    tensor.uniform_(a, b)
+
+
+def normal_(tensor, mean=0.0, std=1.0):
+    tensor.normal_(mean, std)
+
+
+def xavier_uniform_(tensor, gain=1.0, *, data_format="NCHW"):
+    tensor.xavier_uniform_(gain, data_format=data_format)
+
+
+def xavier_normal_(tensor, gain=1.0, *, data_format="NCHW"):
+    tensor.xavier_normal_(gain, data_format=data_format)
+
+
+def kaiming_uniform_(
+    tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+):
+    tensor.kaiming_uniform_(a, mode, nonlinearity, data_format=data_format)
+
+
+def kaiming_normal_(
+    tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
+):
+    tensor.kaiming_normal_(a, mode, nonlinearity, data_format=data_format)
+
+
+def constant_(tensor, val):
+    tensor.fill_(val)
+
+
+def ones_(tensor):
+    tensor.fill_(1)
+
+
+def zeros_(tensor):
+    tensor.fill_(0)
+
+
+def _calculate_fan_in_and_fan_out(tensor):
+    dimensions = tensor.ndimension()
+    if dimensions < 2:
+        raise ValueError(
+            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+        )
+    num_input_fmaps = tensor.size(1)
+    num_output_fmaps = tensor.size(0)
+    receptive_field_size = 1
+    if tensor.ndimension() > 2:
+        for s in tensor.size()[2:]:
+            receptive_field_size *= s
+    fan_in = num_input_fmaps * receptive_field_size
+    fan_out = num_output_fmaps * receptive_field_size
+    return (fan_in, fan_out)
diff --git a/python/oneflow/nn/module.py b/python/oneflow/nn/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf47c240d13c6bf280d89e8e8cf397db16db0c7
--- /dev/null
+++ b/python/oneflow/nn/module.py
@@ -0,0 +1,528 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import itertools
+from collections import OrderedDict, namedtuple
+from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, TypeVar, Union
+
+import numpy as np
+
+import oneflow as flow
+from oneflow.framework.check_point_v2 import FeedValueToVariable
+from oneflow.framework.function_util import global_function_or_identity
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.parameter import Parameter
+
+
+class _IncompatibleKeys(
+    namedtuple("IncompatibleKeys", ["missing_keys", "unexpected_keys"])
+):
+    def __repr__(self):
+        if not self.missing_keys and (not self.unexpected_keys):
+            return "<All keys matched successfully>"
+        return super(_IncompatibleKeys, self).__repr__()
+
+    __str__ = __repr__
+
+
+def _addindent(s_, numSpaces):
+    s = s_.split("\n")
+    if len(s) == 1:
+        return s_
+    first = s.pop(0)
+    s = [numSpaces * " " + line for line in s]
+    s = "\n".join(s)
+    s = first + "\n" + s
+    return s
+
+
+T = TypeVar("T", bound="Module")
+
+
+class Module(object):
+    def __init__(self):
+        self.training = True
+        self._consistent = False
+        self._parameters = OrderedDict()
+        self._buffers = OrderedDict()
+        self._non_persistent_buffers_set = set()
+        self._backward_hooks = OrderedDict()
+        self._is_full_backward_hook = None
+        self._forward_hooks = OrderedDict()
+        self._forward_pre_hooks = OrderedDict()
+        self._state_dict_hooks = OrderedDict()
+        self._load_state_dict_pre_hooks = OrderedDict()
+        self._modules = OrderedDict()
+
+    @property
+    def consistent(self):
+        return self._consistent
+
+    def forward(self, *args):
+        raise NotImplementedError()
+
+    def consistent_forward(self, *args):
+        return self.forward(*args)
+
+    def force_mirrored_forward(self, *args):
+        raise NotImplementedError()
+
+    def __call__(self, *args):
+        for hook in itertools.chain(self._forward_pre_hooks.values()):
+            result = hook(self, args)
+            if result is not None:
+                if not isinstance(result, tuple):
+                    result = (result,)
+                args = result
+        res = self.forward(*args)
+        return res
+
+    def add_module(self, name: str, module: Optional["Module"]) -> None:
+        """Adds a child module to the current module.
+
+        The module can be accessed as an attribute using the given name.
+
+        Args:
+            name (string): name of the child module. The child module can be
+                accessed from this module using the given name
+            module (Module): child module to be added to the module.
+        """
+        if not isinstance(module, Module) and module is not None:
+            raise TypeError("{} is not a Module subclass".format(type(module)))
+        elif not isinstance(name, str):
+            raise TypeError("module name should be a string. Got {}".format(type(name)))
+        elif hasattr(self, name) and name not in self._modules:
+            raise KeyError("attribute '{}' already exists".format(name))
+        elif "." in name:
+            raise KeyError('module name can\'t contain ".", got: {}'.format(name))
+        elif name == "":
+            raise KeyError('module name can\'t be empty string ""')
+        self._modules[name] = module
+
+    def register_buffer(
+        self, name: str, tensor: Optional[Tensor], persistent: bool = True
+    ) -> None:
+        if "_buffers" not in self.__dict__:
+            raise AttributeError("cannot assign buffer before Module.__init__() call")
+        elif not isinstance(name, str):
+            raise TypeError("buffer name should be a string. Got {}".format(type(name)))
+        elif "." in name:
+            raise KeyError('buffer name can\'t contain "."')
+        elif name == "":
+            raise KeyError('buffer name can\'t be empty string ""')
+        elif hasattr(self, name) and name not in self._buffers:
+            raise KeyError("attribute '{}' already exists".format(name))
+        elif tensor is not None and (not isinstance(tensor, Tensor)):
+            raise TypeError(
+                "cannot assign '{}' object to buffer '{}' (Tensor or None required)".format(
+                    type(tensor), name
+                )
+            )
+        else:
+            self._buffers[name] = tensor
+            if persistent:
+                self._non_persistent_buffers_set.discard(name)
+            else:
+                self._non_persistent_buffers_set.add(name)
+
+    def register_parameter(self, name: str, param: Optional[Parameter]) -> None:
+        if "_parameters" not in self.__dict__:
+            raise AttributeError(
+                "cannot assign parameter before Module.__init__() call"
+            )
+        elif not isinstance(name, str):
+            raise TypeError(
+                "parameter name should be a string. Got {}".format(type(name))
+            )
+        elif "." in name:
+            raise KeyError('parameter name can\'t contain "."')
+        elif name == "":
+            raise KeyError('parameter name can\'t be empty string ""')
+        elif hasattr(self, name) and name not in self._parameters:
+            raise KeyError("attribute '{}' already exists".format(name))
+        if param is None:
+            self._parameters[name] = None
+        elif not isinstance(param, Parameter):
+            raise TypeError(
+                "cannot assign '{}' object to parameter '{}' (nn.Parameter or None required)".format(
+                    type(param), name
+                )
+            )
+        else:
+            self._parameters[name] = param
+
+    def __getattr__(self, name: str) -> Union[Tensor, "Module"]:
+        if "_parameters" in self.__dict__:
+            _parameters = self.__dict__["_parameters"]
+            if name in _parameters:
+                return _parameters[name]
+        if "_buffers" in self.__dict__:
+            _buffers = self.__dict__["_buffers"]
+            if name in _buffers:
+                return _buffers[name]
+        if "_modules" in self.__dict__:
+            modules = self.__dict__["_modules"]
+            if name in modules:
+                return modules[name]
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, name)
+        )
+
+    def __setattr__(self, name: str, value: Union[Tensor, "Module"]) -> None:
+        def remove_from(*dicts_or_sets):
+            for d in dicts_or_sets:
+                if name in d:
+                    if isinstance(d, dict):
+                        del d[name]
+                    else:
+                        d.discard(name)
+
+        params = self.__dict__.get("_parameters")
+        if isinstance(value, Parameter):
+            if params is None:
+                raise AttributeError(
+                    "cannot assign parameters before Module.__init__() call"
+                )
+            remove_from(
+                self.__dict__,
+                self._buffers,
+                self._modules,
+                self._non_persistent_buffers_set,
+            )
+            self.register_parameter(name, value)
+        elif params is not None and name in params:
+            if value is not None:
+                raise TypeError(
+                    "cannot assign '{}' as parameter '{}' (nn.Parameter or None expected)".format(
+                        type(value), name
+                    )
+                )
+            self.register_parameter(name, value)
+        else:
+            modules = self.__dict__.get("_modules")
+            if isinstance(value, Module):
+                if modules is None:
+                    raise AttributeError(
+                        "cannot assign module before Module.__init__() call"
+                    )
+                remove_from(
+                    self.__dict__,
+                    self._parameters,
+                    self._buffers,
+                    self._non_persistent_buffers_set,
+                )
+                modules[name] = value
+            elif modules is not None and name in modules:
+                if value is not None:
+                    raise TypeError(
+                        "cannot assign '{}' as child module '{}' (nn.Module or None expected)".format(
+                            type(value), name
+                        )
+                    )
+                modules[name] = value
+            else:
+                buffers = self.__dict__.get("_buffers")
+                if buffers is not None and name in buffers:
+                    if value is not None and (not isinstance(value, Tensor)):
+                        raise TypeError(
+                            "cannot assign '{}' as buffer '{}' (Tensor or None expected)".format(
+                                type(value), name
+                            )
+                        )
+                    buffers[name] = value
+                else:
+                    object.__setattr__(self, name, value)
+
+    def _named_members(self, get_members_fn, prefix="", recurse=True):
+        memo = set()
+        modules = self.named_modules(prefix=prefix) if recurse else [(prefix, self)]
+        for (module_prefix, module) in modules:
+            members = get_members_fn(module)
+            for (k, v) in members:
+                if v is None or v in memo:
+                    continue
+                memo.add(v)
+                name = module_prefix + ("." if module_prefix else "") + k
+                yield (name, v)
+
+    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
+        for (name, param) in self.named_parameters(recurse=recurse):
+            yield param
+
+    def named_parameters(
+        self, prefix: str = "", recurse: bool = True
+    ) -> Iterator[Tuple[str, Tensor]]:
+        gen = self._named_members(
+            lambda module: module._parameters.items(), prefix=prefix, recurse=recurse
+        )
+        for elem in gen:
+            yield elem
+
+    def buffers(self, recurse: bool = True) -> Iterator[Tensor]:
+        for (name, buf) in self.named_buffers(recurse=recurse):
+            yield buf
+
+    def named_buffers(
+        self, prefix: str = "", recurse: bool = True
+    ) -> Iterator[Tuple[str, Tensor]]:
+        gen = self._named_members(
+            lambda module: module._buffers.items(), prefix=prefix, recurse=recurse
+        )
+        for elem in gen:
+            yield elem
+
+    def children(self) -> Iterator["Module"]:
+        for (name, module) in self.named_children():
+            yield module
+
+    def named_children(self) -> Iterator[Tuple[str, "Module"]]:
+        memo = set()
+        for (name, module) in self._modules.items():
+            if module is not None and module not in memo:
+                memo.add(module)
+                yield (name, module)
+
+    def modules(self) -> Iterator["Module"]:
+        for (name, module) in self.named_modules():
+            yield module
+
+    def named_modules(self, memo: Optional[Set["Module"]] = None, prefix: str = ""):
+        if memo is None:
+            memo = set()
+        if self not in memo:
+            memo.add(self)
+            yield (prefix, self)
+            for (name, module) in self._modules.items():
+                if module is None:
+                    continue
+                submodule_prefix = prefix + ("." if prefix else "") + name
+                for m in module.named_modules(memo, submodule_prefix):
+                    yield m
+
+    def train(self: T, mode: bool = True) -> T:
+        self.training = mode
+        for module in self.children():
+            module.train(mode)
+        return self
+
+    def eval(self: T) -> T:
+        return self.train(False)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        for (name, param) in self._parameters.items():
+            if param is not None:
+                destination[prefix + name] = param
+        for (name, buf) in self._buffers.items():
+            if buf is not None and name not in self._non_persistent_buffers_set:
+                destination[prefix + name] = buf
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        for hook in self._load_state_dict_pre_hooks.values():
+            hook(
+                state_dict,
+                prefix,
+                local_metadata,
+                strict,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+        persistent_buffers = {
+            k: v
+            for (k, v) in self._buffers.items()
+            if k not in self._non_persistent_buffers_set
+        }
+        local_name_params = itertools.chain(
+            self._parameters.items(), persistent_buffers.items()
+        )
+        local_state = {k: v for (k, v) in local_name_params if v is not None}
+        for (name, param) in local_state.items():
+            key = prefix + name
+            if key in state_dict:
+                input_param = state_dict[key]
+                if tuple(input_param.shape) != tuple(param.shape):
+                    error_msgs.append(
+                        "size mismatch for {}: copying a param with shape {} from checkpoint, the shape in current model is {}.".format(
+                            key, input_param.shape, param.shape
+                        )
+                    )
+                    continue
+                try:
+                    param.copy_(input_param)
+                except Exception as ex:
+                    error_msgs.append(
+                        'While copying the parameter named "{}", whose dimensions in the model are {} and whose dimensions in the checkpoint are {}, an exception occurred : {}.'.format(
+                            key, param.shape, input_param.shape, ex.args
+                        )
+                    )
+            elif strict:
+                missing_keys.append(key)
+        if strict:
+            for key in state_dict.keys():
+                if key.startswith(prefix):
+                    input_name = key[len(prefix) :]
+                    input_name = input_name.split(".", 1)[0]
+                    if (
+                        input_name not in self._modules
+                        and input_name not in local_state
+                    ):
+                        unexpected_keys.append(key)
+
+    def load_state_dict(
+        self,
+        state_dict: Union[Dict[str, Tensor], Dict[str, Tensor]],
+        strict: bool = True,
+    ):
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        metadata = getattr(state_dict, "_metadata", None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=""):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict,
+                prefix,
+                local_metadata,
+                True,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+            for (name, child) in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + ".")
+
+        load(self)
+        load = None
+        if strict:
+            if len(unexpected_keys) > 0:
+                error_msgs.insert(
+                    0,
+                    "Unexpected key(s) in state_dict: {}. ".format(
+                        ", ".join(('"{}"'.format(k) for k in unexpected_keys))
+                    ),
+                )
+            if len(missing_keys) > 0:
+                error_msgs.insert(
+                    0,
+                    "Missing key(s) in state_dict: {}. ".format(
+                        ", ".join(('"{}"'.format(k) for k in missing_keys))
+                    ),
+                )
+        if len(error_msgs) > 0:
+            raise RuntimeError(
+                "Error(s) in loading state_dict for {}:\n\t{}".format(
+                    self.__class__.__name__, "\n\t".join(error_msgs)
+                )
+            )
+        return _IncompatibleKeys(missing_keys, unexpected_keys)
+
+    def state_dict(
+        self, destination=None, prefix="", keep_vars=False
+    ) -> Dict[str, Tensor]:
+        if destination is None:
+            destination = OrderedDict()
+            destination._metadata = OrderedDict()
+        self._save_to_state_dict(destination, prefix, keep_vars)
+        for (name, module) in self._modules.items():
+            if module is not None:
+                module.state_dict(destination, prefix + name + ".", keep_vars=keep_vars)
+        for hook in self._state_dict_hooks.values():
+            hook_result = hook(self, destination, prefix)
+            if hook_result is not None:
+                destination = hook_result
+        return destination
+
+    def register_forward_pre_hook(self, hook: Callable[..., None]) -> None:
+        self._forward_pre_hooks[len(self._forward_pre_hooks)] = hook
+
+    def _apply(self, fn):
+        for module in self.children():
+            module._apply(fn)
+        for (key, param) in self._parameters.items():
+            if param is not None:
+                assert isinstance(param, Parameter)
+                assert param.is_leaf
+                with flow.no_grad():
+                    param_applied = Tensor(fn(param))
+                self._parameters[key] = Parameter(param_applied, param.requires_grad)
+                if param.grad is not None:
+                    assert param.grad.is_leaf
+                    with flow.no_grad():
+                        grad_applied = Tensor(fn(param.grad))
+                    self._parameters[key].grad = grad_applied.requires_grad_(
+                        param.grad.requires_grad
+                    )
+        for (key, buf) in self._buffers.items():
+            if buf is not None:
+                self._buffers[key] = Tensor(fn(buf))
+        return self
+
+    def apply(self: T, fn: Callable[["Module"], None]) -> T:
+        for module in self.children():
+            module.apply(fn)
+        fn(self)
+        return self
+
+    def to(self, device: Optional[Union[str, flow.device]] = None):
+        def convert(t):
+            return t.to(device)
+
+        return self._apply(convert)
+
+    def _get_name(self):
+        return self.__class__.__name__
+
+    def extra_repr(self) -> str:
+        """Set the extra representation of the module
+
+        To print customized extra information, you should re-implement
+        this method in your own modules. Both single-line and multi-line
+        strings are acceptable.
+        """
+        return ""
+
+    def __repr__(self):
+        extra_lines = []
+        extra_repr = self.extra_repr()
+        if extra_repr:
+            extra_lines = extra_repr.split("\n")
+        child_lines = []
+        for (key, module) in self._modules.items():
+            mod_str = repr(module)
+            mod_str = _addindent(mod_str, 2)
+            child_lines.append("(" + key + "): " + mod_str)
+        lines = extra_lines + child_lines
+        main_str = self._get_name() + "("
+        if lines:
+            if len(extra_lines) == 1 and (not child_lines):
+                main_str += extra_lines[0]
+            else:
+                main_str += "\n  " + "\n  ".join(lines) + "\n"
+        main_str += ")"
+        return main_str
diff --git a/python/oneflow/nn/modules/__init__.py b/python/oneflow/nn/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/nn/modules/abs.py b/python/oneflow/nn/modules/abs.py
new file mode 100644
index 0000000000000000000000000000000000000000..09b1f9385303d61e42d59141824404a1143167ad
--- /dev/null
+++ b/python/oneflow/nn/modules/abs.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Abs(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.abs(x)
+
+
+@register_tensor_op("abs")
+def abs_op(x):
+    """Return the absolute value of each element in input tensor:math:`y = |x|` element-wise.
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> x = flow.Tensor(np.array([-1, 2, -3, 4]).astype(np.float32))
+        >>> flow.abs(x)
+        tensor([1., 2., 3., 4.], dtype=oneflow.float32)
+
+    """
+    return Abs()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/acos.py b/python/oneflow/nn/modules/acos.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfb2e608f512923bf56f902dccb01fd283d03847
--- /dev/null
+++ b/python/oneflow/nn/modules/acos.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Acos(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.acos(x)
+
+
+@register_tensor_op("acos")
+def acos_op(tensor):
+    """
+    Returns a new tensor with the inverse cosine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\arccos(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> arr = np.array([0.5, 0.6, 0.7])
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.acos(input)
+        >>> output
+        tensor([1.0472, 0.9273, 0.7954], dtype=oneflow.float32)
+
+    """
+    return Acos()(tensor)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/acosh.py b/python/oneflow/nn/modules/acosh.py
new file mode 100644
index 0000000000000000000000000000000000000000..42474008e9affc450d1b2f8d43359590a265ab21
--- /dev/null
+++ b/python/oneflow/nn/modules/acosh.py
@@ -0,0 +1,94 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Acosh(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.acosh(x)
+
+
+def acosh_op(x):
+    """Returns a new tensor with the inverse hyperbolic cosine of the elements of :attr:`input`.
+
+    .. math::
+
+        \\text{out}_{i} = \\cosh^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x1 = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        >>> out1 = flow.acosh(x1)
+        >>> out1
+        tensor([1.317 , 1.7627, 2.0634], dtype=oneflow.float32)
+        >>> x2 = flow.Tensor(np.array([1.5, 2.6, 3.7]).astype(np.float32),device=flow.device('cuda'))
+        >>> out2 = flow.acosh(x2)
+        >>> out2
+        tensor([0.9624, 1.6094, 1.9827], device='cuda:0', dtype=oneflow.float32)
+
+    """
+    return Acosh()(x)
+
+
+@register_tensor_op("acosh")
+def acosh_op_tensor(x):
+    """
+
+    acosh() -> Tensor
+
+    See :func:`oneflow.acosh`
+
+    """
+    return Acosh()(x)
+
+
+def arccosh_op(x):
+    """
+
+    See :func:`oneflow.acosh`
+
+    """
+    return Acosh()(x)
+
+
+@register_tensor_op("arccosh")
+def arccosh_op_tensor(x):
+    """
+
+    arccosh() -> Tensor
+
+    See :func:`oneflow.acosh`
+
+    """
+    return Acosh()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/activation.py b/python/oneflow/nn/modules/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c3abb6e5f5aa516a1fb6aaecb0c5fe08d04819a
--- /dev/null
+++ b/python/oneflow/nn/modules/activation.py
@@ -0,0 +1,959 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import warnings
+from typing import Optional
+
+import oneflow as flow
+import oneflow._oneflow_internal
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _check_inplace_valid
+
+
+def _softmax_need_transpose(x, axis):
+    assert type(axis) is int
+    dim_num = len(x.shape)
+    if dim_num == 1:
+        return (False, None)
+    if axis < 0:
+        axis += dim_num
+    assert axis >= 0
+    assert axis < dim_num
+    need_transpose = False
+    permute = list(range(dim_num))
+    if axis != dim_num - 1:
+        need_transpose = True
+        permute[axis] = permute[-1]
+        permute[-1] = axis
+    return (need_transpose, permute)
+
+
+class PReLU(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        PReLU(x) = \\max(0,x) + a * \\min(0,x)
+
+    Here :math:`a` is a learnable parameter. When called without arguments, `nn.PReLU()` uses a single
+    parameter :math:`a` across all input channels. If called with `nn.PReLU(nChannels)`,
+    a separate :math:`a` is used for each input channel.
+
+
+    .. note::
+        weight decay should not be used when learning :math:`a` for good performance.
+
+    .. note::
+        Channel dim is the 2nd dim of input. When input has dims < 2, then there is
+        no channel dim and the number of channels = 1.
+
+    Args:
+        num_parameters (int): number of :math:`a` to learn.
+            Although it takes an int as input, there is only two values are legitimate:
+            1, or the number of channels at input. Default: 1
+        init (float): the initial value of :math:`a`. Default: 0.25
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    Attr:
+        - weight (Tensor): the learnable weights of shape (:attr:`num_parameters`).
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> m = flow.nn.PReLU()
+        >>> input = flow.Tensor(np.asarray([[[[1, -2], [3, 4]]]]), dtype=flow.float32)
+        >>> print(m(input).numpy())
+        [[[[ 1.  -0.5]
+           [ 3.   4. ]]]]
+
+    """
+
+    def __init__(self, num_parameters: int = 1, init: float = 0.25) -> None:
+        super().__init__()
+        self.num_parameters = num_parameters
+        self.weight = flow.nn.Parameter(flow.Tensor(num_parameters, 1, 1).fill_(init))
+
+    def forward(self, x):
+        assert (
+            self.num_parameters == 1 or self.num_parameters == x.shape[1]
+        ), f"num_parameters in prelu must be 1 or {x.shape[1]}"
+        return flow.F.prelu(x, self.weight)
+
+
+class ReLU(Module):
+    """Applies the rectified linear unit function element-wise:
+
+    :math:`\\text{ReLU}(x) = (x)^+ = \\max(0, x)`
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> relu = flow.nn.ReLU()
+        >>> ndarr = np.asarray([1, -2, 3])
+        >>> x = flow.Tensor(ndarr)
+        >>> relu(x)
+        tensor([1., 0., 3.], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            _check_inplace_valid(x)
+        return flow.F.relu(x, self.inplace)
+
+    def extra_repr(self):
+        inplace_str = "inplace=True" if self.inplace else ""
+        return inplace_str
+
+
+class ReLU6(Module):
+    """Applies the element-wise function:
+
+    .. math::
+
+        \\text{Relu6}(x) = \\begin{cases}
+            6 & \\text{ if } x > 6 \\\\
+            0 & \\text{ if } x < 0 \\\\
+            x & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> relu6 = flow.nn.ReLU6()
+
+        >>> out = relu6(input)
+        >>> out
+        tensor([0. , 0. , 0.5], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            warnings.warn("ReLU6 module do not support inplace now")
+        return flow.F.hardtanh(x, min_val=0.0, max_val=6.0)
+
+    def extra_repr(self):
+        inplace_str = "inplace=True" if self.inplace else ""
+        return inplace_str
+
+
+class Tanh(Module):
+    """This operator computes the hyperbolic tangent value of Tensor.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{e^x-e^{-x}}{e^x+e^{-x}}
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-1, 0, 1]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> tanh = flow.nn.Tanh()
+        >>> out = tanh(input)
+        >>> out
+        tensor([-0.7616,  0.    ,  0.7616], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.tanh(x)
+
+
+@register_tensor_op("tanh")
+def tanh_op(x):
+    """This operator computes the hyperbolic tangent value of Tensor.
+
+    The equation is:
+
+    .. math::
+
+        out = \\frac{e^x-e^{-x}}{e^x+e^{-x}}
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-1, 0, 1]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> tanh = flow.nn.Tanh()
+        >>> out = tanh(input)
+        >>> out
+        tensor([-0.7616,  0.    ,  0.7616], dtype=oneflow.float32)
+
+    """
+    return Tanh()(x)
+
+
+class ELU(Module):
+    """Applies the element-wise function:
+
+    .. math::
+
+        \\text{ELU}(x) = \\begin{cases}
+				x & \\text{ if } x \\gt 0  \\\\
+                \\alpha*(exp(x)-1) & \\text{ if } x \\le 0 \\\\
+    		    \\end{cases}
+
+    Args:
+        alpha: the :math:`\\alpha` value for the ELU formulation. Default: 1.0
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> elu = flow.nn.ELU()
+
+        >>> out = elu(input)
+        >>> out
+        tensor([-0.3935,  0.    ,  0.5   ], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, alpha: float = 1.0, inplace: bool = False):
+        super().__init__()
+        self.alpha = alpha
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            warnings.warn("ELU module do not support inplace now")
+        return flow.F.elu(x, alpha=self.alpha)
+
+    def extra_repr(self):
+        param_str = f"alpha={self.alpha}"
+        param_str += ", inplace=True" if self.inplace else ""
+        return param_str
+
+
+class GELU(Module):
+    """Gelu activation operator.
+
+    The equation is:
+
+    .. math::
+        out = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    Args:
+        x (oneflow.Tensor): Input Tensor
+
+    Returns:
+        oneflow.Tensor: A Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> gelu = flow.nn.GELU()
+
+        >>> out = gelu(input)
+        >>> out
+        tensor([-0.1543,  0.    ,  0.3457], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.gelu(x)
+
+
+@register_tensor_op("gelu")
+def gelu_op(x):
+    """Gelu activation operator.
+
+    The equation is:
+
+    .. math::
+        out = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    Args:
+        x (oneflow.Tensor): Input Tensor
+
+    Returns:
+        oneflow.Tensor: A Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> gelu = flow.nn.GELU()
+
+        >>> out = gelu(input)
+        >>> out
+        tensor([-0.1543,  0.    ,  0.3457], dtype=oneflow.float32)
+
+    """
+    return GELU()(x)
+
+
+class Sigmoid(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Sigmoid}(x) = \\sigma(x) = \\frac{1}{1 + \\exp(-x)}
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = flow.Tensor(np.array([0.81733328, 0.43621480, 0.10351428]))
+        >>> m = flow.nn.Sigmoid()
+        >>> out = m(x)
+        >>> out
+        tensor([0.6937, 0.6074, 0.5259], dtype=oneflow.float32)
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.sigmoid(x)
+
+
+@register_tensor_op("sigmoid")
+def sigmoid_op(x):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Sigmoid}(x) = \\sigma(x) = \\frac{1}{1 + \\exp(-x)}
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = flow.Tensor(np.array([0.81733328, 0.43621480, 0.10351428]))
+        >>> out = flow.sigmoid(x)
+        >>> out
+        tensor([0.6937, 0.6074, 0.5259], dtype=oneflow.float32)
+
+    """
+    return Sigmoid()(x)
+
+
+class Hardsigmoid(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Hardsigmoid}(x) = \\begin{cases}
+            0 & \\text{ if } x \\le -3  \\\\
+            1 & \\text{ if } x \\ge +3 \\\\
+            \\frac{x}{6} + \\frac{1}{2} & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> hardsigmoid = flow.nn.Hardsigmoid()
+
+        >>> out = hardsigmoid(input)
+        >>> out
+        tensor([0.4167, 0.5   , 0.5833], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            warnings.warn("Hardsigmoid module do not support inplace now")
+        return flow.F.hardsigmoid(x)
+
+    def extra_repr(self):
+        inplace_str = "inplace=True" if self.inplace else ""
+        return inplace_str
+
+
+class Softmax(Module):
+    def __init__(self, dim: Optional[int] = None):
+        super().__init__()
+        self.axis = -1 if dim is None else dim
+
+    def forward(self, x):
+        (need_transpose, permute) = _softmax_need_transpose(x, self.axis)
+        if need_transpose:
+            x = flow.F.transpose(x, perm=permute)
+        res = flow.F.softmax(x)
+        if need_transpose:
+            res = flow.F.transpose(res, perm=permute)
+        return res
+
+    def extra_repr(self):
+        return f"axis={self.axis}"
+
+
+@register_tensor_op("softmax")
+def softmax_op(tensor, dim=None):
+    """Applies the Softmax function to an n-dimensional input Tensor
+    rescaling them so that the elements of the n-dimensional output Tensor
+    lie in the range [0,1] and sum to 1.
+
+    Softmax is defined as:
+
+    .. math::
+        \\text{Softmax}(x_{i}) = \\frac{\\exp(x_i)}{\\sum_j \\exp(x_j)}
+
+    When the input Tensor is a sparse tensor then the unspecifed
+    values are treated as ``-inf``.
+
+    Shape:
+        - Input: :math:`(*)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
+
+    Returns:
+        a Tensor of the same dimension and shape as the input with
+        values in the range [0, 1]
+
+    Args:
+        dim (int): A dimension along which Softmax will be computed (so every slice
+            along dim will sum to 1).
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> m = flow.nn.Softmax(dim = 2)
+        >>> x = flow.Tensor(
+        ...    np.array(
+        ...        [[[-0.46716809,  0.40112534,  0.61984003],
+        ...        [-1.31244969, -0.42528763,  1.47953856]]]
+        ...    )
+        ... )
+        >>> out = m(x)
+        >>> out
+        tensor([[[0.1575, 0.3754, 0.4671],
+                 [0.0507, 0.123 , 0.8263]]], dtype=oneflow.float32)
+    """
+    return Softmax(dim)(tensor)
+
+
+class LogSoftmax(Module):
+    """Applies the :math:`\\log(\\text{Softmax}(x))` function to an n-dimensional
+    input Tensor.
+    The LogSoftmax formulation can be simplified as:
+
+    .. math::
+        \\text{LogSoftmax}(x_{i}) = \\log\\left(\\frac{\\exp(x_i) }{ \\sum_j \\exp(x_j)} \\right)
+
+    Args:
+        dim (int): A dimension along which LogSoftmax will be computed.
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> m = flow.nn.LogSoftmax(dim=1)
+        >>> x = flow.Tensor(
+        ...    np.array(
+        ...        [[ 0.4296, -1.1957,  2.5463],
+        ...        [ 1.2552, -1.5747,  0.6923]]
+        ...    )
+        ... )
+        >>> out = m(x)
+        >>> out
+        tensor([[-2.2513, -3.8766, -0.1346],
+                [-0.4877, -3.3176, -1.0506]], dtype=oneflow.float32)
+    """
+
+    def __init__(self, dim: Optional[int] = 1):
+        super().__init__()
+        self.dim = dim
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        if not hasattr(self, "dim"):
+            self.dim = None
+
+    def forward(self, x):
+        (need_transpose, permute) = _softmax_need_transpose(x, self.dim)
+        if need_transpose:
+            x = flow.F.transpose(x, perm=permute)
+        x = x.softmax()
+        res = x.log()
+        if need_transpose:
+            res = flow.F.transpose(res, perm=permute)
+        return res
+
+    def extra_repr(self):
+        return f"dim={self.dim}"
+
+
+class LogSigmoid(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{LogSigmoid}(x) = \\log\\left(\\frac{ 1 }{ 1 + \\exp(-x)}\\right)
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> logsigmoid = flow.nn.LogSigmoid()
+
+        >>> out = logsigmoid(input)
+        >>> out
+        tensor([-0.9741, -0.6931, -0.4741], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        sigmoid_res = flow.sigmoid(x)
+        res = flow.log(sigmoid_res)
+        return res
+
+
+class Softplus(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Softplus}(x) = \\frac{1}{\\beta} * \\log(1 + \\exp(\\beta * x))
+
+    SoftPlus is a smooth approximation to the ReLU function and can be used
+    to constrain the output of a machine to always be positive.
+
+    For numerical stability the implementation reverts to the linear function
+    when :math:`input \\times \\beta > threshold`.
+
+    Args:
+        beta: the :math:`\\beta` value for the Softplus formulation. Default: 1
+        threshold: values above this revert to a linear function. Default: 20
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> softplus = flow.nn.Softplus()
+
+        >>> out = softplus(input)
+        >>> out
+        tensor([0.4741, 0.6931, 0.9741], dtype=oneflow.float32)
+    """
+
+    def __init__(self, beta: int = 1, threshold: int = 20):
+        super().__init__()
+        self.beta = beta
+        self.threshold = threshold
+
+    def forward(self, x):
+        return flow.where(
+            x * self.beta > self.threshold,
+            x,
+            1 / self.beta * flow.log(1.0 + flow.exp(self.beta * x)),
+        )
+
+    def extra_repr(self):
+        return f"beta={self.beta}, threshold={self.threshold}"
+
+
+class Hardswish(Module):
+    """Applies the hardswish function, element-wise, as described in the paper:
+    `Searching for MobileNetV3`_.
+
+    .. math::
+        \\text{Hardswish}(x) = \\begin{cases}
+            0 & \\text{ if } x \\le -3  \\\\
+            x & \\text{ if } x \\ge +3 \\\\
+            x*(x+3)/6 & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> hardswish = flow.nn.Hardswish()
+
+        >>> out = hardswish(input)
+        >>> out
+        tensor([-0.2083,  0.    ,  0.2917], dtype=oneflow.float32)
+
+    .. _`Searching for MobileNetV3`:
+        https://arxiv.org/abs/1905.02244
+    """
+
+    def __init__(self, inplace: bool = False):
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            warnings.warn("Hardswish module do not support inplace now")
+        return flow.F.hardswish(x)
+
+    def extra_repr(self):
+        inplace_str = "inplace=True" if self.inplace else ""
+        return inplace_str
+
+
+class Hardtanh(Module):
+    """
+    Applies the HardTanh function element-wise
+
+    HardTanh is defined as:
+
+    .. math::
+        \\text{HardTanh}(x) = \\begin{cases}
+            1 & \\text{ if } x > 1 \\\\
+            -1 & \\text{ if } x < -1 \\\\
+            x & \\text{ otherwise } \\\\
+        \\end{cases}
+
+    The range of the linear region :math:`[-1, 1]` can be adjusted using
+    :attr:`min_val` and :attr:`max_val`.
+
+    Args:
+        min_val: minimum value of the linear region range. Default: -1
+        max_val: maximum value of the linear region range. Default: 1
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Keyword arguments :attr:`min_value` and :attr:`max_value`
+    have been deprecated in favor of :attr:`min_val` and :attr:`max_val`.
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> m = flow.nn.Hardtanh()
+        >>> arr = np.array([0.2, 0.3, 3.0, 4.0])
+        >>> x = flow.Tensor(arr)
+        >>> out = m(x)
+        >>> out
+        tensor([0.2, 0.3, 1. , 1. ], dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        min_val: float = -1,
+        max_val: float = 1,
+        inplace: bool = False,
+        min_value: Optional[float] = None,
+        max_value: Optional[float] = None,
+    ):
+        super().__init__()
+        if min_value is not None:
+            warnings.warn(
+                "keyword argument min_value is deprecated and rename to min_val"
+            )
+            min_val = min_value
+        if max_value is not None:
+            warnings.warn(
+                "keyword argument max_value is deprecated and rename to max_val"
+            )
+            max_val = max_value
+        self.min_val = min_val
+        self.max_val = max_val
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            warnings.warn("Hardtanh module do not support inplace now")
+        return flow.F.hardtanh(x, min_val=self.min_val, max_val=self.max_val)
+
+    def extra_repr(self):
+        param_str = f"min_val={self.min_val}, max_val={self.max_val}"
+        param_str += ", inplace=True" if self.inplace else ""
+        return param_str
+
+
+class LeakyReLU(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{LeakyRELU}(x) = \\begin{cases}
+            x, & \\text{ if } x \\geq 0 \\\\
+            \\text{negative_slope} \\times x, & \\text{ otherwise }
+        \\end{cases}
+
+    Args:
+        negative_slope: Controls the angle of the negative slope. Default: 1e-2
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> m = flow.nn.LeakyReLU(0.1)
+        >>> arr = np.array([0.2, 0.3, 3.0, 4.0])
+        >>> x = flow.Tensor(arr)
+        >>> out = m(x)
+        >>> out
+        tensor([0.2, 0.3, 3. , 4. ], dtype=oneflow.float32)
+    """
+
+    def __init__(self, negative_slope: float = 0.01, inplace: bool = False):
+        super().__init__()
+        self.negative_slope = negative_slope
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            warnings.warn("LeakyReLU module do not support inplace now")
+        return flow.F.leaky_relu(x, alpha=self.negative_slope)
+
+    def extra_repr(self):
+        param_str = f"negative_slope={self.negative_slope}"
+        param_str += ", inplace=True" if self.inplace else ""
+        return param_str
+
+
+class Mish(Module):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Mish}(x) = x * \\text{Tanh}(\\text{Softplus}(x))
+
+    .. note::
+        See `Mish: A Self Regularized Non-Monotonic Neural Activation Function <https://arxiv.org/abs/1908.08681>`_
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([1, 2, 3]).astype(np.float32)
+        >>> input = flow.Tensor(x)
+        >>> mish = flow.nn.Mish()
+
+        >>> out = mish(input)
+        >>> out
+        tensor([0.8651, 1.944 , 2.9865], dtype=oneflow.float32)
+    """
+
+    def __init__(self, inplace: bool = False):
+        assert not inplace, "In-place operation is not currently supported"
+        super().__init__()
+
+    def forward(self, x):
+        return x * flow.tanh(flow.softplus(x))
+
+
+def mish_op(x):
+    """Applies the element-wise function:
+
+    .. math::
+        \\text{Mish}(x) = x * \\text{Tanh}(\\text{Softplus}(x))
+
+    .. note::
+        See `Mish: A Self Regularized Non-Monotonic Neural Activation Function <https://arxiv.org/abs/1908.08681>`_
+
+    See :mod:`oneflow.nn.Mish`
+    """
+    return Mish()(x)
+
+
+@register_tensor_op("mish")
+def mish_op_tensor(x):
+    """
+    mish() -> Tensor
+    See :func:`oneflow.mish`
+    """
+    return Mish()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/adaptive_pool.py b/python/oneflow/nn/modules/adaptive_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef0a37c75572fb717ecce2ec888d391c3e2230e3
--- /dev/null
+++ b/python/oneflow/nn/modules/adaptive_pool.py
@@ -0,0 +1,221 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+def _generate_output_size(input_size, output_size):
+    new_output_size = []
+    if isinstance(output_size, int):
+        for _ in range(len(input_size) - 2):
+            new_output_size.append(output_size)
+    elif isinstance(output_size, tuple):
+        assert len(input_size) - 2 == len(
+            output_size
+        ), f"The length of 'output_size' does not match the input size, {len(input_size) - 2} expected"
+        for i in range(len(output_size)):
+            if output_size[i] is None:
+                new_output_size.append(input_size[i + 2])
+            else:
+                assert isinstance(
+                    output_size[i], int
+                ), "numbers in 'output_size' should be integer"
+                new_output_size.append(output_size[i])
+    else:
+        raise ValueError("invalid 'output_size', 'int' or 'tuple' expected")
+    return tuple(new_output_size)
+
+
+class AdaptiveAvgPool1d(Module):
+    """Applies a 1D adaptive average pooling over an input signal composed of several input planes.
+
+    The output size is H, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size H
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> import oneflow.nn as nn
+        
+        >>> m = nn.AdaptiveAvgPool1d(5)
+        >>> input = flow.Tensor(np.random.randn(1, 64, 8))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 5])
+
+    """
+
+    def __init__(self, output_size) -> None:
+        super().__init__()
+        self.output_size = output_size
+
+    def forward(self, x):
+        assert len(x.shape) == 3
+        if isinstance(self.output_size, tuple):
+            new_output_size = self.output_size[0]
+        elif isinstance(self.output_size, int):
+            new_output_size = self.output_size
+        else:
+            raise ValueError("'output_size' should be integer or tuple")
+        return flow.F.adaptive_avg_pool1d(x, output_size=(new_output_size,))
+
+
+def adaptive_avg_pool1d(input, output_size):
+    """Applies a 1D adaptive average pooling over an input signal composed of several input planes.
+
+    See :mod:`oneflow.nn.AdaptiveAvgPool1d`
+
+    Args:
+        input: input tensor
+        output_size: the target output size (single integer)
+    """
+    return AdaptiveAvgPool1d(output_size)(input)
+
+
+class AdaptiveAvgPool2d(Module):
+    """Applies a 2D adaptive average pooling over an input signal composed of several input planes.
+
+    The output is of size H x W, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size of the image of the form H x W.
+                     Can be a tuple (H, W) or a single H for a square image H x H.
+                     H and W can be either a ``int``, or ``None`` which means the size will
+                     be the same as that of the input.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> import oneflow.nn as nn
+        
+        >>> m = nn.AdaptiveAvgPool2d((5,7))
+        >>> input = flow.Tensor(np.random.randn(1, 64, 8, 9))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 5, 7])
+
+        >>> m = nn.AdaptiveAvgPool2d(7)
+        >>> input = flow.Tensor(np.random.randn(1, 64, 10, 9))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 7, 7])
+
+        >>> m = nn.AdaptiveAvgPool2d((None, 7))
+        >>> input = flow.Tensor(np.random.randn(1, 64, 10, 9))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 10, 7])
+
+    """
+
+    def __init__(self, output_size) -> None:
+        super().__init__()
+        self.output_size = output_size
+
+    def forward(self, x):
+        assert len(x.shape) == 4
+        new_output_size = _generate_output_size(x.shape, self.output_size)
+        return flow.F.adaptive_avg_pool2d(x, output_size=new_output_size)
+
+
+def adaptive_avg_pool2d(input, output_size):
+    """Applies a 2D adaptive average pooling over an input signal composed of several input planes.
+
+    See :mod:`oneflow.nn.AdaptiveAvgPool2d`
+
+    Args:
+        input: input tensor
+        output_size: the target output size (single integer or double-integer tuple)
+    """
+    return AdaptiveAvgPool2d(output_size)(input)
+
+
+class AdaptiveAvgPool3d(Module):
+    """Applies a 3D adaptive average pooling over an input signal composed of several input planes.
+
+    The output is of size D x H x W, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size of the form D x H x W.
+                     Can be a tuple (D, H, W) or a single number D for a cube D x D x D.
+                     D, H and W can be either a ``int``, or ``None`` which means the size will
+                     be the same as that of the input.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> import oneflow.nn as nn
+        
+        >>> m = nn.AdaptiveAvgPool3d((5,7,9))
+        >>> input = flow.Tensor(np.random.randn(1, 64, 8, 9, 10))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 5, 7, 9])
+
+        >>> m = nn.AdaptiveAvgPool3d(7)
+        >>> input = flow.Tensor(np.random.randn(1, 64, 10, 9, 8))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 7, 7, 7])
+
+        >>> m = nn.AdaptiveAvgPool3d((7, None, None))
+        >>> input = flow.Tensor(np.random.randn(1, 64, 10, 9, 8))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([1, 64, 7, 9, 8])
+
+    """
+
+    def __init__(self, output_size) -> None:
+        super().__init__()
+        self.output_size = output_size
+
+    def forward(self, x):
+        assert len(x.shape) == 5
+        new_output_size = _generate_output_size(x.shape, self.output_size)
+        return flow.F.adaptive_avg_pool3d(x, output_size=new_output_size)
+
+
+def adaptive_avg_pool3d(input, output_size):
+    """Applies a 3D adaptive average pooling over an input signal composed of several input planes.
+
+    See :mod:`oneflow.nn.AdaptiveAvgPool3d`
+
+    Args:
+        input: input tensor
+        output_size: the target output size (single integer or triple-integer tuple)
+    """
+    return AdaptiveAvgPool3d(output_size)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/arange.py b/python/oneflow/nn/modules/arange.py
new file mode 100644
index 0000000000000000000000000000000000000000..491ca3598b7e38e825788ca0e6291073bfa184a0
--- /dev/null
+++ b/python/oneflow/nn/modules/arange.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Arange(Module):
+    def __init__(
+        self,
+        start: int = 0,
+        end: int = None,
+        step: int = 1,
+        dtype: flow.dtype = None,
+        device: Union[str, flow.device] = "cpu",
+        requires_grad: bool = False,
+    ) -> None:
+        super().__init__()
+        assert end > start, "end should be larger than start"
+        assert step <= end - start, "step is ilegal"
+        self.start = start
+        self.end = end
+        self.step = step
+        self.dtype = dtype
+        self.device = device
+        self.requires_grad = requires_grad
+
+    def forward(self):
+        tmp = flow.F.range(
+            start=self.start, limit=self.end, delta=self.step, dtype=flow.int64
+        )
+        tmp.requires_grad = self.requires_grad
+        if isinstance(self.device, str):
+            device = flow.device(self.device)
+        else:
+            device = self.device
+        res = tmp.to(device, dtype=self.dtype)
+        return res
+
+
+def arange_op(
+    start: int = 0,
+    end: int = None,
+    step: int = 1,
+    dtype: flow.dtype = flow.int64,
+    device: Union[str, flow.device] = "cpu",
+    requires_grad: bool = False,
+):
+    """
+    Returns a 1-D tensor of size :math:`\\left\\lfloor \\frac{\\text{end} - \\text{start}}{\\text{step}} \\right\\rfloor + 1`
+    with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is
+    the gap between two values in the tensor.
+
+    .. math::
+        \\text{out}_{i+1} = \\text{out}_i + \\text{step}.
+
+    Args:
+        start (int): the starting value for the set of points. Default: ``0``.
+        end (int): the ending value for the set of points
+        step (int): the gap between each pair of adjacent points. Default: ``1``.
+
+    Keyword args:
+        dtype(flow.dtype, optional): If `dtype` is not given, the `dtype` is inferred to be `flow.int64`.
+        device(flow.device, optional): the desired device of returned tensor. Default: if None, uses the current device for the default tensor.
+        requires_grad(bool, optional): If autograd should record operations on the returned tensor. Default: `False`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        
+        >>> y = flow.arange(0, 5)
+        >>> y
+        tensor([0, 1, 2, 3, 4], dtype=oneflow.int64)
+
+    """
+    if end is None:
+        end = start
+        start = 0
+    return Arange(start, end, step, dtype, device, requires_grad)()
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/argmax.py b/python/oneflow/nn/modules/argmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0edfd8c9f960312d5f2edef2b7823b078cf2da0
--- /dev/null
+++ b/python/oneflow/nn/modules/argmax.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class Argmax(Module):
+    def __init__(self, dim: int = None, keepdim: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, input):
+        if self.dim == None:
+            input = flow.F.flatten(input)
+            self.dim = 0
+        num_axes = len(input.shape)
+        axis = self.dim if self.dim >= 0 else self.dim + num_axes
+        assert 0 <= axis < num_axes, "axis out of range"
+        if axis == num_axes - 1:
+            x = flow.F.argmax(input)
+            if self.keepdim == True:
+                x = flow.unsqueeze(x, -1)
+            return x
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+            x = flow.F.transpose(input, perm=perm)
+            x = flow.F.argmax(x)
+            x = flow.unsqueeze(x, -1)
+            x = flow.F.transpose(x, perm=get_inversed_perm(perm))
+            if self.keepdim == False:
+                x = x.squeeze(dim=[axis])
+            return x
+
+
+@register_tensor_op("argmax")
+def argmax_op(input, dim: int = None, keepdim: bool = False):
+    """The op computes the index with the largest value of a Tensor at specified axis.
+
+    Args:
+        input (oneflow.Tensor): Input Tensor
+        dim (int, optional): dimension to be calculated. Defaults to the last dim (-1)
+        keepdim (bool optional):  whether the output tensor has dim retained or not. Ignored if dim=None.
+
+    Returns:
+        oneflow.Tensor: A Tensor(dtype=int32) contains the index with the largest value of `input`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array([[1, 3, 8, 7, 2],
+        ...            [1, 9, 4, 3, 2]], dtype=np.float32)
+
+        >>> out = flow.argmax(flow.Tensor(x))
+        >>> out
+        tensor([6], dtype=oneflow.int32)
+        >>> out = flow.argmax(flow.Tensor(x), dim=1)
+        >>> out
+        tensor([2, 1], dtype=oneflow.int32)
+
+    """
+    return Argmax(dim=dim, keepdim=keepdim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/argsort.py b/python/oneflow/nn/modules/argsort.py
new file mode 100644
index 0000000000000000000000000000000000000000..c66b0567b494a675f4904ce25551918529e55011
--- /dev/null
+++ b/python/oneflow/nn/modules/argsort.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class Argsort(Module):
+    def __init__(self, dim: int = -1, descending: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        direction = "DESCENDING" if descending else "ASCENDING"
+        self._argsort_op = (
+            flow.builtin_op("arg_sort")
+            .Input("in")
+            .Output("out")
+            .Attr("direction", direction)
+            .Build()
+        )
+
+    def forward(self, input):
+        num_dims = len(input.shape)
+        dim = self.dim if self.dim >= 0 else self.dim + num_dims
+        assert 0 <= dim < num_dims, "dim out of range"
+        if dim == num_dims - 1:
+            return self._argsort_op(input)[0]
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_dims, dim)
+            x = flow.F.transpose(input, perm=perm)
+            x = self._argsort_op(x)[0]
+            return flow.F.transpose(x, perm=get_inversed_perm(perm))
+
+
+@register_tensor_op("argsort")
+def argsort_op(input, dim: int = -1, descending: bool = False):
+    """This operator sorts the input Tensor at specified dim and return the indices of the sorted Tensor.
+
+    Args:
+        input (oneflow.Tensor): The input Tensor.
+        dim (int, optional): dimension to be sorted. Defaults to the last dim (-1).
+        descending (bool, optional): controls the sorting order (ascending or descending).
+
+    Returns:
+        oneflow.Tensor: The indices of the sorted Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> x = np.array([[10, 2, 9, 3, 7],
+        ...               [1, 9, 4, 3, 2]]).astype("float32")
+        >>> input = flow.Tensor(x)
+        >>> output = flow.argsort(input)
+        >>> output
+        tensor([[1, 3, 4, 2, 0],
+                [0, 4, 3, 2, 1]], dtype=oneflow.int32)
+        >>> output = flow.argsort(input, descending=True)
+        >>> output
+        tensor([[0, 2, 4, 3, 1],
+                [1, 2, 3, 4, 0]], dtype=oneflow.int32)
+        >>> output = flow.argsort(input, dim=0)
+        >>> output
+        tensor([[1, 0, 1, 0, 1],
+                [0, 1, 0, 1, 0]], dtype=oneflow.int32)
+
+    """
+    return Argsort(dim=dim, descending=descending)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/argwhere.py b/python/oneflow/nn/modules/argwhere.py
new file mode 100644
index 0000000000000000000000000000000000000000..79cbca02d89ccf3a62434da27fcef9991c92f309
--- /dev/null
+++ b/python/oneflow/nn/modules/argwhere.py
@@ -0,0 +1,85 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import numpy as np
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Argwhere(Module):
+    def __init__(self, dtype) -> None:
+        super().__init__()
+        if dtype == None:
+            dtype = flow.int32
+        self.dtype = dtype
+
+    def forward(self, x):
+        (res, size) = flow.F.argwhere(x, dtype=self.dtype)
+        slice_tup_list = [[0, int(size.numpy()), 1]]
+        return flow.slice(res, slice_tup_list=slice_tup_list)
+
+
+def argwhere_op(x, dtype: Optional[flow.dtype] = None):
+    """This operator finds the indices of input Tensor `x` elements that are non-zero. 
+
+    It returns a list in which each element is a coordinate that points to a non-zero element in the condition.
+
+    Args:
+        x (oneflow.Tensor): The input Tensor.
+        dtype (Optional[flow.dtype], optional): The data type of output. Defaults to None.
+
+    Returns:
+        oneflow.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> x = np.array([[0, 1, 0],
+        ...            [2, 0, 2]]).astype(np.float32)
+        
+        >>> input = flow.Tensor(x)
+        >>> output = flow.argwhere(input)
+        >>> output
+        tensor([[0, 1],
+                [1, 0],
+                [1, 2]], dtype=oneflow.int32)
+
+    """
+    return Argwhere(dtype=dtype)(x)
+
+
+@register_tensor_op("argwhere")
+def argwhere_tebsor_op(x, dtype: Optional[flow.dtype] = None):
+    """
+
+    argwhere() -> Tensor
+
+    See :func:`oneflow.argwhere`
+
+    """
+    return Argwhere(dtype=dtype)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/atan2.py b/python/oneflow/nn/modules/atan2.py
new file mode 100644
index 0000000000000000000000000000000000000000..26083e5d7adf2cd62197b19b3230c6b7c5d576b1
--- /dev/null
+++ b/python/oneflow/nn/modules/atan2.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Atan2(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.atan2_op = (
+            flow.builtin_op("atan2").Input("x").Input("y").Output("z").Build()
+        )
+
+    def forward(self, x, y):
+        return self.atan2_op(x, y)[0]
+
+
+def atan2_op(input, other):
+    """Element-wise arctangent of input{i}/other{i}
+    with consideration of the quadrant. Returns a new tensor with the signed
+    angles in radians between vector (other{i},input{i}) and vector (1, 0).
+
+    The shapes of input and other must be broadcastable.
+
+    Args:
+        input (Tensor): the first input tensor.
+
+        other (Tensor): the second input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> x1 = flow.Tensor(np.array([1,2,3]))
+        >>> y1 = flow.Tensor(np.array([3,2,1]))
+        >>> x2 = flow.Tensor(np.array([1.53123589,0.54242598,0.15117185]))
+        >>> y2 = flow.Tensor(np.array([-0.21906378,0.09467151,-0.75562878]))
+        >>> x3 = flow.Tensor(np.array([1,0,-1]))
+        >>> y3 = flow.Tensor(np.array([0,1,0]))
+
+        >>> flow.atan2(x1,y1).numpy()
+        array([0.32175055, 0.7853982 , 1.2490457 ], dtype=float32)
+        >>> flow.atan2(x2,y2).numpy()
+        array([1.7128955, 1.3980033, 2.9441385], dtype=float32)
+        >>> flow.atan2(x3,y3).numpy()
+        array([ 1.5707964,  0.       , -1.5707964], dtype=float32)
+
+    """
+    return Atan2()(input, other)
+
+
+@register_tensor_op("atan2")
+def atan2_op_tensor(input, other):
+    """
+
+    atan2(other) -> Tensor
+
+    See :func:`oneflow.atan2`
+    """
+    return Atan2()(input, other)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/atanh.py b/python/oneflow/nn/modules/atanh.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5bb53c1dd9e3a5a8264f324c6f200e8373d6708
--- /dev/null
+++ b/python/oneflow/nn/modules/atanh.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Atanh(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.atanh(x)
+
+
+def atanh_op(input):
+    """Returns a new tensor with the inverse hyperbolic tangent of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\tanh^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> np_arr = np.array([0.5, 0.6, 0.7]).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> output = flow.atanh(input)
+        >>> output
+        tensor([0.5493, 0.6931, 0.8673], dtype=oneflow.float32)
+
+    """
+    return Atanh()(input)
+
+
+@register_tensor_op("atanh")
+def atanh_op_tensor(x):
+    """
+    atanh() -> Tensor
+    See :func:`oneflow.atanh`
+
+    """
+    return Atanh()(x)
+
+
+def arctanh_op(input):
+    """
+
+    Alias for :func:`oneflow.atanh`
+    """
+    return Atanh()(input)
+
+
+@register_tensor_op("arctanh")
+def arctanh_op_tensor(input):
+    """
+
+    Alias for :func:`oneflow.atanh`
+    """
+    return Atanh()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/batchnorm.py b/python/oneflow/nn/modules/batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6384ae5b0b44c47b835a973ad22fc72db2815883
--- /dev/null
+++ b/python/oneflow/nn/modules/batchnorm.py
@@ -0,0 +1,337 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+class _NormBase(Module):
+    """Common base of _InstanceNorm and _BatchNorm"""
+
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-05,
+        momentum: float = 0.1,
+        affine: bool = True,
+        track_running_stats: bool = True,
+    ) -> None:
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.momentum = momentum
+        self.affine = affine
+        self.track_running_stats = track_running_stats
+        if self.affine:
+            self.weight = flow.nn.Parameter(flow.Tensor(num_features))
+            self.bias = flow.nn.Parameter(flow.Tensor(num_features))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        if self.track_running_stats:
+            self.register_buffer("running_mean", flow.Tensor(num_features))
+            self.register_buffer("running_var", flow.Tensor(num_features))
+        else:
+            self.register_parameter("running_mean", None)
+            self.register_parameter("running_var", None)
+        self.reset_parameters()
+
+    def reset_running_stats(self) -> None:
+        if self.track_running_stats:
+            self.running_mean.fill_(0)
+            self.running_var.fill_(1)
+
+    def reset_parameters(self) -> None:
+        self.reset_running_stats()
+        if self.affine:
+            flow.nn.init.ones_(self.weight)
+            flow.nn.init.zeros_(self.bias)
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        super(_NormBase, self)._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def extra_repr(self):
+        return "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, track_running_stats={track_running_stats}".format(
+            **self.__dict__
+        )
+
+
+class _BatchNorm(_NormBase):
+    def __init__(
+        self,
+        num_features,
+        eps=1e-05,
+        momentum=0.1,
+        affine=True,
+        track_running_stats=True,
+    ):
+        super().__init__(num_features, eps, momentum, affine, track_running_stats)
+
+    def forward(self, x):
+        self._check_input_dim(x)
+        if x.device == flow.device("cpu"):
+            reduce_axis = []
+            for dim in range(len(x.shape)):
+                if dim != 1:
+                    reduce_axis.append(dim)
+            mean = x.mean(dim=reduce_axis, keepdim=False)
+            variance = x.var(dim=reduce_axis, keepdim=False)
+            if self.training and self.track_running_stats:
+                running_mean = (
+                    self.momentum * self.running_mean + (1 - self.momentum) * mean
+                )
+                running_var = (
+                    self.momentum * self.running_var + (1 - self.momentum) * variance
+                )
+                self.__setattr__("running_mean", flow.Tensor(running_mean))
+                self.__setattr__("running_var", flow.Tensor(running_var))
+            else:
+                mean = mean if self.running_mean is None else self.running_mean
+                variance = variance if self.running_var is None else self.running_var
+            axis = 1
+            params_shape = [x.shape[axis]]
+            weight = self.weight
+            bias = self.bias
+            if len(mean.shape) == 1:
+                nd_params_shape = [1] * len(x.shape)
+                nd_params_shape[axis] = params_shape[0]
+                mean = mean.reshape(shape=nd_params_shape)
+                variance = variance.reshape(shape=nd_params_shape)
+                if self.weight and params_shape[0] == self.weight.nelement():
+                    weight = self.weight.reshape(shape=nd_params_shape)
+                if self.bias and params_shape[0] == self.bias.nelement():
+                    bias = self.bias.reshape(shape=nd_params_shape)
+            elif len(mean.shape) == len(x.shape):
+                pass
+            else:
+                raise ValueError(
+                    "shape of mean and variance should be 1D or has number of axes and x's"
+                )
+            variance += self.eps
+            normalized = (x - mean) * variance.rsqrt()
+            affined = normalized
+            if self.weight:
+                affined = affined * weight
+            if self.bias:
+                affined = affined + bias
+            return affined
+        elif self.track_running_stats:
+            return flow.F.normalization(
+                x,
+                self.running_mean,
+                self.running_var,
+                self.weight,
+                self.bias,
+                axis=1,
+                epsilon=self.eps,
+                momentum=self.momentum,
+                is_training=self.training,
+            )
+        else:
+            reduce_axis = []
+            for dim in range(len(x.shape)):
+                if dim != 1:
+                    reduce_axis.append(dim)
+            return flow.F.normalization(
+                x,
+                x.mean(dim=reduce_axis, keepdim=False),
+                x.var(dim=reduce_axis, keepdim=False),
+                self.weight,
+                self.bias,
+                axis=1,
+                epsilon=self.eps,
+                momentum=self.momentum,
+                is_training=self.training,
+            )
+
+
+class BatchNorm1d(_BatchNorm):
+    """Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D
+    inputs with optional additional channel dimension) as described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{\\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension over
+    the mini-batches and :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size). By default, the elements of :math:`\\gamma` are set
+    to 1 and the elements of :math:`\\beta` are set to 0. The standard-deviation is calculated
+    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done over the `C` dimension, computing statistics
+    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics.
+            in both training and eval modes. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C)` or :math:`(N, C, L)`
+        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.random.randn(20, 100))
+        >>> m = flow.nn.BatchNorm1d(100)
+        >>> y = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.ndim != 2 and input.ndim != 3:
+            raise ValueError(
+                "expected 2D or 3D input (got {}D input)".format(input.ndim)
+            )
+
+
+class BatchNorm2d(_BatchNorm):
+    """Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs
+    with additional channel dimension) as described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension over
+    the mini-batches and :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size). By default, the elements of :math:`\\gamma` are set
+    to 1 and the elements of :math:`\\beta` are set to 0. The standard-deviation is calculated
+    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done over the `C` dimension, computing statistics
+    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, H, W)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics.
+            in both training and eval modes. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C, H, W)`
+        - Output: :math:`(N, C, H, W)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.random.randn(4, 2, 8, 3))
+        >>> m = flow.nn.BatchNorm2d(num_features=2, eps=1e-5, momentum=0.1)
+        >>> y = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.ndim != 4:
+            raise ValueError("expected 4D input (got {}D input)".format(input.ndim()))
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/bmm.py b/python/oneflow/nn/modules/bmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..445d7b5a91225a957267c3cc9df830cec491583f
--- /dev/null
+++ b/python/oneflow/nn/modules/bmm.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class BMM(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, mat2):
+        assert (
+            input.shape[0] == mat2.shape[0] and input.shape[2] == mat2.shape[1]
+        ), f"batch dim or matmul dim not match, please check input!"
+        return flow.F.batch_matmul(input, mat2)
+
+
+def bmm_op(x, y):
+    """
+    Performs a batch matrix-matrix product of matrices stored in input and mat2.
+
+    `input` and `mat2` must be 3-D tensors each containing the same number of matrices.
+
+    If input is a (b x n x m) tensor, mat2 is a (b x m x p) tensor, out will be a (b x n x p) tensor.
+
+    Args:
+        input(oneflow.Tensor):  the first batch of matrices to be multiplied
+        mat2(oneflow.Tensor): the second batch of matrices to be multiplied
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input1 = flow.Tensor(np.random.randn(10, 3, 4), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(10, 4, 5), dtype=flow.float32)
+        >>> of_out = flow.bmm(input1, input2)
+        >>> of_out.shape
+        flow.Size([10, 3, 5])
+    """
+    return BMM()(x, y)
+
+
+@register_tensor_op("bmm")
+def bmm_op_tensor(x, y):
+    """
+
+    bmm() -> Tensor
+
+    See :func:`oneflow.bmm`
+
+    """
+    return BMM()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/broadcast_like.py b/python/oneflow/nn/modules/broadcast_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..33cfc2ecf4e4b1623a21b216bf0060848333db3a
--- /dev/null
+++ b/python/oneflow/nn/modules/broadcast_like.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+def _calc_broadcast_axes(x, like_tensor):
+    num_prepend = len(like_tensor.shape) - len(x.shape)
+    prepend_shape = [1] * num_prepend + list(x.shape)
+    broadcast_axes = [x for x in range(num_prepend)]
+    for i in range(num_prepend, len(prepend_shape)):
+        if prepend_shape[i] != like_tensor.shape[i]:
+            if prepend_shape[i] != 1:
+                raise RuntimeError(
+                    f"output with shape {x.shape} doesn't match the broadcast shape {like_tensor.shape}"
+                )
+            else:
+                broadcast_axes.append(i)
+    return tuple(broadcast_axes)
+
+
+class BroadCastLike(Module):
+    def __init__(self, broadcast_axes: Optional[Sequence] = None) -> None:
+        super().__init__()
+        self.broadcast_axes = broadcast_axes
+
+    def forward(self, x, like_tensor):
+        if self.broadcast_axes is None:
+            broadcast_axes = _calc_broadcast_axes(x, like_tensor)
+        else:
+            broadcast_axes = self.broadcast_axes
+        return flow.F.broadcast_like(x, like_tensor, broadcast_axes=broadcast_axes)
+
+
+def broadcast_like_op(x, like_tensor, broadcast_axes: Optional[Sequence] = None):
+    return BroadCastLike(broadcast_axes=broadcast_axes)(x, like_tensor)
diff --git a/python/oneflow/nn/modules/cast.py b/python/oneflow/nn/modules/cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..e18495f042af12800ba34b90c11c70cda4950cc1
--- /dev/null
+++ b/python/oneflow/nn/modules/cast.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Cast(Module):
+    def __init__(self, dtype: flow.dtype) -> None:
+        super().__init__()
+        self.dtype = dtype
+
+    def forward(self, x):
+        return flow.F.cast(x, dtype=self.dtype)
+
+
+@register_tensor_op("cast")
+def cast_op(x, dtype):
+    """The operation takes input tensor `x` and casts it to the output with `dtype`
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+        dtype (flow.dtype): Data type of the output tensor
+
+    Returns:
+        oneflow.Tensor: A Tensor with specific dtype.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> np_arr = np.random.randn(2, 3, 4, 5).astype(np.float32)
+        >>> input = flow.Tensor(np_arr, dtype=flow.float32)
+        >>> output = flow.cast(input, flow.int8)
+        >>> np.array_equal(output.numpy(), np_arr.astype(np.int8))
+        True
+
+    """
+    return Cast(dtype)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/chunk.py b/python/oneflow/nn/modules/chunk.py
new file mode 100644
index 0000000000000000000000000000000000000000..73d05c018ebea26fcda10cf0d749fd493931eed0
--- /dev/null
+++ b/python/oneflow/nn/modules/chunk.py
@@ -0,0 +1,127 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor, register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.ops.array_ops import check_slice_tup_list
+
+
+class Chunk(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, chunks, dim):
+        if dim is not None:
+            assert input.shape[dim] > 0, "chunk expects at least a 1-dimensional tensor"
+            assert chunks > 0, "chunk expects `chunks` to be greater than 0"
+            channel = input.dim()
+            dim_size = input.shape[dim]
+            chunk_size = (
+                dim_size / chunks if dim_size % chunks == 0 else int(dim_size / chunks)
+            )
+            last_chunk_size = (
+                dim_size / chunks
+                if dim_size % chunks == 0
+                else dim_size - chunk_size * (chunks - 1)
+            )
+            chunk_dim_dict = {}
+            tup_ndim = []
+            splits = []
+            for chunk in range(0, chunks):
+                if dim_size % chunks == 0:
+                    start = chunk * chunk_size
+                    stop = (chunk + 1) * chunk_size
+                else:
+                    start = (
+                        chunk * chunk_size
+                        if chunk < chunks - 1
+                        else chunk_size * (chunks - 1)
+                    )
+                    stop = (chunk + 1) * chunk_size if chunk < chunks - 1 else dim_size
+                step = 1
+                chunk_dim_dict.setdefault(dim, []).append(
+                    [int(start), int(stop), int(step)]
+                )
+            for (k, v) in chunk_dim_dict.items():
+                for v_chunk in v:
+                    tup_list = []
+                    for i in range(0, channel):
+                        if i != dim:
+                            tup_list.append([None, None, None])
+                        else:
+                            tup_list.append(v_chunk)
+                    (start_tup, stop_tup, step_tup) = check_slice_tup_list(
+                        tup_list, input.shape
+                    )
+                    splits.append(
+                        flow.F.slice(
+                            input, start=start_tup, stop=stop_tup, step=step_tup
+                        )
+                    )
+            return splits
+
+
+@register_tensor_op("chunk")
+def chunk_op(input, chunks, dim):
+    """Splits a tensor into a specific number of chunks. Each chunk is a view of the input tensor. Last chunk will be smaller if the tensor size along the given dimension dim is not divisible by chunks.
+
+    Args:
+        input (oneflow.Tensor): The tensor to split.
+        chunks (int): Number of chunks to return.
+        dim (int): Dimension along which to split the tensor.
+
+    Returns:
+        List of Tensors.
+
+    For example:
+
+    .. code-block:: python
+    
+        >>> import oneflow as flow
+        >>> import numpy as np
+               
+        >>> np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> of_out = []
+        >>> of_out = flow.chunk(input, chunks=3, dim=2)
+        >>> chunks = 3
+        >>> of_out_shape = []
+        >>> for i in range(0, chunks):
+        ...     of_out_shape.append(of_out[i].numpy().shape)
+        >>> of_out_shape
+        [(5, 3, 2, 9), (5, 3, 2, 9), (5, 3, 2, 9)]
+
+        >>> np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> of_out = []
+        >>> of_out = flow.chunk(input, chunks=4, dim=3)
+        >>> chunks = 4
+        >>> of_out_shape = []
+        >>> for i in range(0, chunks):
+        ...     of_out_shape.append(of_out[i].numpy().shape)
+        >>> of_out_shape
+        [(5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 3)]
+
+    """
+    return Chunk()(input, chunks, dim)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/concat.py b/python/oneflow/nn/modules/concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..b799fe77f4f541fc84bf9dc191936ce4ebd8a90b
--- /dev/null
+++ b/python/oneflow/nn/modules/concat.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor, register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Cat(Module):
+    def __init__(self, dim=0) -> None:
+        super().__init__()
+        self.axis = dim
+
+    def forward(self, inputs):
+        if len(inputs) == 1:
+            return inputs[0]
+        axis = self.axis
+        assert len(inputs) >= 2
+        if axis < 0:
+            axis += len(inputs[0].shape)
+        assert axis >= 0 and axis < len(
+            inputs[0].shape
+        ), "axis must be in range [0, num_axes of inputs)"
+        first_input_shape = inputs[0].shape
+        dynamic_dim_size = 0
+        for input in inputs:
+            assert len(input.shape) == len(first_input_shape)
+            for i in range(len(input.shape)):
+                if i == axis:
+                    dynamic_dim_size += input.shape[i]
+                else:
+                    assert input.shape[i] == first_input_shape[i]
+        return flow.F.concat(inputs, axis=axis, max_dim_size=dynamic_dim_size)
+
+
+def concat_op(inputs, dim=0):
+    """Concatenate two or more `Tensor` s at specified axis.
+
+    Analogous to `numpy.concatenate <https://docs.scipy.org/doc/numpy/reference/generated/numpy.concatenate.html>`_
+
+    Args:
+        inputs: a `list` of `Tensor`
+        dim: a `int`.
+
+    Returns:
+        A `Tensor`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> input1 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> input3 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+
+        >>> out = flow.cat([input1, input2, input3], dim=1)
+        >>> out.shape
+        flow.Size([2, 18, 5, 3])
+
+    """
+    return Cat(dim=dim)(inputs)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/constant.py b/python/oneflow/nn/modules/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0de614c86589f9f017cecb0d4ea460e6a937f70
--- /dev/null
+++ b/python/oneflow/nn/modules/constant.py
@@ -0,0 +1,267 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.common_types import _size_any_t
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _single
+
+
+class _ConstantBase(Module):
+    def __init__(
+        self,
+        size: Union[_size_any_t, flow.Size],
+        value: Union[float, int],
+        dtype: Optional[flow.dtype],
+        device: Union[flow.device, str] = None,
+        requires_grad: bool = False,
+    ) -> None:
+        super().__init__()
+        assert size is not None, "shape must not be None!"
+        assert isinstance(
+            size, (int, tuple, flow.Size)
+        ), "shape should be int or tuple int!"
+        self.device = device
+        self.requires_grad = requires_grad
+        size = _single(size)
+        if dtype is None:
+            dtype = flow.float32
+        if device is None:
+            self.device = flow.device("cpu")
+        self.shape = size
+        self.value = value
+        self.dtype = dtype
+
+    def forward(self):
+        res = flow.F.constant(self.shape, self.value, self.dtype)
+        res = res.to(device=self.device)
+        res.requires_grad = self.requires_grad
+        return res
+
+
+class Ones(_ConstantBase):
+    def __init__(self, size, dtype=None, device=None, requires_grad=False):
+        super().__init__(size, 1, dtype, device, requires_grad)
+
+
+def ones_op(
+    size: Union[_size_any_t, flow.Size],
+    dtype: Optional[flow.dtype] = None,
+    device: Union[flow.device, str, None] = None,
+    requires_grad: bool = False,
+):
+    """
+    Returns a tensor filled with the scalar value 1,
+    with the shape defined by the variable argument `size`.
+
+    Args:
+        size (an integer or tuple of integer values) 鈥� defining the shape of the output tensor. Can be \\
+         a variable number of arguments or a collection like a list or tuple.
+        dtype (flow.dtype, optional) 鈥� the desired data type of returned tensor.
+        device (torch.device, optional) 鈥� the desired device of returned tensor. Default: if None, uses the current device for the default tensor type
+        requires_grad (bool, optional) 鈥� If autograd should record operations on the returned tensor. Default: False.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> y = flow.ones(5)
+        >>> y
+        tensor([1., 1., 1., 1., 1.], dtype=oneflow.float32)
+
+    """
+    return Ones(size, dtype, device, requires_grad)()
+
+
+class Zeros(_ConstantBase):
+    def __init__(self, size, dtype=None, device=None, requires_grad=False):
+        super().__init__(size, 0, dtype, device, requires_grad)
+
+
+def zeros_op(
+    size: Union[_size_any_t, flow.Size],
+    dtype: Optional[flow.dtype] = None,
+    device: Union[flow.device, str, None] = None,
+    requires_grad: bool = False,
+):
+    """
+    Returns a tensor filled with the scalar value 0,
+    with the shape defined by the variable argument `size`.
+
+    Args:
+        size(an integer or tuple of integer values) - defining the shape of the output tensor. Can be \\
+         a variable number of arguments or a collection like a list or tuple.
+        dtype (flow.dtype, optional) 鈥� the desired data type of returned tensor.
+        device (torch.device, optional) 鈥� the desired device of returned tensor. Default: if None, uses the current device for the default tensor type
+        requires_grad (bool, optional) 鈥� If autograd should record operations on the returned tensor. Default: False.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> y = flow.zeros(5)
+        >>> y
+        tensor([0., 0., 0., 0., 0.], dtype=oneflow.float32)
+
+    """
+    return Zeros(size, dtype, device, requires_grad)()
+
+
+class ZerosLike(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, other):
+        return flow.F.zeros_like(other)
+
+
+def zeros_like_op(other):
+    """
+    Returns a tensor filled with the scalar value 0, with the same size as input.
+    flow.zeros_like(input) is equivalent to flow.zeros(input.shape, dtype=input.dtype)
+
+    Args:
+        other(Tensor): The size of input will determine size of the output tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = flow.Tensor(np.random.rand(5))
+        >>> y = flow.zeros_like(x)
+        >>> y
+        tensor([0., 0., 0., 0., 0.], dtype=oneflow.float32)
+
+    """
+    return ZerosLike()(other)
+
+
+class OnesLike(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, other):
+        return flow.F.ones_like(other)
+
+
+def ones_like_op(other):
+    """
+    Returns a tensor filled with the scalar value 1, with the same size as input.
+    flow.ones_like(input) is equivalent to flow.ones(input.shape, dtype=input.dtype)
+
+    Args:
+        other(Tensor): The size of input will determine size of the output tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = flow.Tensor(np.random.rand(5))
+        >>> y = flow.ones_like(x)
+        >>> y
+        tensor([1., 1., 1., 1., 1.], dtype=oneflow.float32)
+
+    """
+    return OnesLike()(other)
+
+
+class NewOnes(Module):
+    def __init__(
+        self,
+        size: Union[_size_any_t, flow.Size] = None,
+        dtype: Optional[flow.dtype] = None,
+        device: Union[flow.device, str] = None,
+        requires_grad: bool = False,
+    ):
+        super().__init__()
+        self.device = device
+        self.requires_grad = requires_grad
+        if size != None:
+            size = _single(size)
+        self.size = size
+        self.dtype = dtype
+
+    def forward(self, x):
+        new_size = self.size
+        new_dtype = self.dtype
+        new_device = self.device
+        new_requires_grad = self.requires_grad
+        if self.size is None:
+            new_size = x.shape
+        if self.dtype is None:
+            new_dtype = x.dtype
+        if self.device is None:
+            new_device = x.device
+        assert isinstance(
+            new_size, (int, tuple, flow.Size)
+        ), f"size parameter not correct, please check!"
+        assert isinstance(
+            new_dtype, flow.dtype
+        ), f"dtype parameter not correct, please check!"
+        assert isinstance(
+            new_device, (str, flow.device)
+        ), f"device parameter not correct, please check!"
+        assert isinstance(
+            new_requires_grad, bool
+        ), f"requires_grad parameter not correct, please check!"
+        res = flow.F.constant(new_size, 1.0, new_dtype)
+        res = res.to(new_device)
+        res.requires_grad = new_requires_grad
+        return res
+
+
+@register_tensor_op("new_ones")
+def new_ones_op(x, size=None, dtype=None, device=None, requires_grad=False):
+    """
+    
+    Returns a Tensor of size size filled with 1. By default, the returned Tensor has the same torch.dtype and torch.device as this tensor.
+
+    Args:
+        size (int...): a list, tuple, or flow.Size of integers defining the shape of the output tensor.
+        dtype (flow.dtype, optional):  the desired type of returned tensor. Default: if None, same flow.dtype as this tensor.
+        device (flow.device, optional): the desired device of returned tensor. Default: if None, same flow.device as this tensor.
+        requires_grad (bool, optional): If autograd should record operations on the returned tensor. Default: False.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = flow.Tensor(np.ones((1, 2, 3)))
+        >>> y = x.new_ones((2, 2))
+        >>> y
+        tensor([[1., 1.],
+                [1., 1.]], dtype=oneflow.float32)
+    """
+    return NewOnes(size=size, dtype=dtype, device=device, requires_grad=requires_grad)(
+        x
+    )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/container.py b/python/oneflow/nn/modules/container.py
new file mode 100644
index 0000000000000000000000000000000000000000..c91fcfdfcae89232edb82c563d27851f7b0a616e
--- /dev/null
+++ b/python/oneflow/nn/modules/container.py
@@ -0,0 +1,534 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections.abc
+import operator
+from collections import OrderedDict
+from itertools import islice
+from typing import (
+    Any,
+    Iterable,
+    Iterator,
+    Mapping,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+    overload,
+)
+
+import oneflow as flow
+from oneflow.nn.module import Module
+
+T = TypeVar("T")
+
+
+class Sequential(Module):
+    """A sequential container.
+    Modules will be added to it in the order they are passed in the constructor.
+    Alternatively, an ordered dict of modules can also be passed in.
+
+    To make it easier to understand, here is a small example:
+
+    .. code-block:: python
+
+        >>> import oneflow.nn as nn
+        >>> nn.Sequential(nn.Conv2d(1,20,5), nn.ReLU(), nn.Conv2d(20,64,5), nn.ReLU()) #doctest: +ELLIPSIS
+        Sequential(
+          (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
+          (1): ReLU()
+          (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
+          (3): ReLU()
+        )
+        >>> nn.Sequential(OrderedDict([
+        ...    ('conv1', nn.Conv2d(1,20,5)),
+        ...    ('relu1', nn.ReLU()),
+        ...    ('conv2', nn.Conv2d(20,64,5)),
+        ...    ('relu2', nn.ReLU())
+        ... ])) #doctest: +ELLIPSIS
+        Sequential(
+          (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
+          (relu1): ReLU()
+          (conv2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
+          (relu2): ReLU()
+        )
+
+    """
+
+    @overload
+    def __init__(self, *args: Module) -> None:
+        ...
+
+    @overload
+    def __init__(self, arg: "OrderedDict[str, Module]") -> None:
+        ...
+
+    def __init__(self, *args: Any):
+        super(Sequential, self).__init__()
+        if len(args) == 1 and isinstance(args[0], OrderedDict):
+            for (key, module) in args[0].items():
+                self.add_module(key, module)
+        else:
+            for (idx, module) in enumerate(args):
+                self.add_module(str(idx), module)
+
+    def _get_item_by_idx(self, iterator, idx):
+        """Get the idx-th item of the iterator"""
+        size = len(self)
+        idx = operator.index(idx)
+        if not -size <= idx < size:
+            raise IndexError("index {} is out of range".format(idx))
+        idx %= size
+        return next(islice(iterator, idx, None))
+
+    def __getitem__(self: T, idx) -> T:
+        if isinstance(idx, slice):
+            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
+        else:
+            return self._get_item_by_idx(self._modules.values(), idx)
+
+    def __setitem__(self, idx: int, module: Module) -> None:
+        key = self._get_item_by_idx(self._modules.keys(), idx)
+        return setattr(self, key, module)
+
+    def __delitem__(self, idx: Union[slice, int]) -> None:
+        if isinstance(idx, slice):
+            for key in list(self._modules.keys())[idx]:
+                delattr(self, key)
+        else:
+            key = self._get_item_by_idx(self._modules.keys(), idx)
+            delattr(self, key)
+
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    def __dir__(self):
+        keys = super(Sequential, self).__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    def __iter__(self) -> Iterator[Module]:
+        return iter(self._modules.values())
+
+    def forward(self, input):
+        for module in self:
+            input = module(input)
+        return input
+
+
+class ParameterList(Module):
+    def __init__(self, parameters: Optional[Iterable["Parameter"]] = None) -> None:
+        super(ParameterList, self).__init__()
+        self._initialized = True
+        if parameters is not None:
+            self += parameters
+
+    def __setstate__(self, state):
+        state["_initialized"] = False
+        super(ParameterList, self).__setstate__(state)
+        self._initialized = True
+
+    def _get_abs_string_index(self, idx):
+        """Get the absolute index for the list of modules"""
+        idx = operator.index(idx)
+        if not -len(self) <= idx < len(self):
+            raise IndexError("index {} is out of range".format(idx))
+        if idx < 0:
+            idx += len(self)
+        return str(idx)
+
+    @overload
+    def __getitem__(self, idx: int) -> "Parameter":
+        ...
+
+    @overload
+    def __getitem__(self: T, idx: slice) -> T:
+        ...
+
+    def __getitem__(self, idx):
+        if isinstance(idx, slice):
+            return self.__class__(list(self._parameters.values())[idx])
+        else:
+            idx = self._get_abs_string_index(idx)
+            return self._parameters[str(idx)]
+
+    def __setitem__(self, idx: int, param: "Parameter") -> None:
+        idx = self._get_abs_string_index(idx)
+        return self.register_parameter(str(idx), param)
+
+    def __setattr__(self, key: Any, value: Any) -> None:
+        if getattr(self, "_initialized", False):
+            if not hasattr(self, key) and (not isinstance(value, flow.nn.Parameter)):
+                warnings.warn("Setting attributes on ParameterList is not supported.")
+        super(ParameterList, self).__setattr__(key, value)
+
+    def __len__(self) -> int:
+        return len(self._parameters)
+
+    def __iter__(self) -> Iterator["Parameter"]:
+        return iter(self._parameters.values())
+
+    def __iadd__(self: T, parameters: Iterable["Parameter"]) -> T:
+        return self.extend(parameters)
+
+    def __dir__(self):
+        keys = super(ParameterList, self).__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    def append(self: T, parameter: "Parameter") -> T:
+        """Appends a given parameter at the end of the list.
+
+        Arguments:
+            parameter (nn.Parameter): parameter to append
+        """
+        self.register_parameter(str(len(self)), parameter)
+        return self
+
+    def extend(self: T, parameters: Iterable["Parameter"]) -> T:
+        """Appends parameters from a Python iterable to the end of the list.
+
+        Arguments:
+            parameters (iterable): iterable of parameters to append
+        """
+        if not isinstance(parameters, collections.abc.Iterable):
+            raise TypeError(
+                "ParameterList.extend should be called with an iterable, but got "
+                + type(parameters).__name__
+            )
+        offset = len(self)
+        for (i, param) in enumerate(parameters):
+            self.register_parameter(str(offset + i), param)
+        return self
+
+    def extra_repr(self) -> str:
+        child_lines = []
+        for (k, p) in self._parameters.items():
+            size_str = "x".join((str(size) for size in p.size()))
+            device_str = "" if not p.is_cuda else " (GPU {})".format(p.get_device())
+            parastr = "Parameter containing: [{} of size {}{}]".format(
+                type(p), size_str, device_str
+            )
+            child_lines.append("  (" + str(k) + "): " + parastr)
+        tmpstr = "\n".join(child_lines)
+        return tmpstr
+
+    def __call__(self, input):
+        raise RuntimeError("ParameterList should not be called.")
+
+    def _replicate_for_data_parallel(self):
+        warnings.warn(
+            "nn.ParameterList is being used with DataParallel but this is not supported. This list will appear empty for the models replicated on each GPU except the original one."
+        )
+        return super(ParameterList, self)._replicate_for_data_parallel()
+
+
+class ParameterDict(Module):
+    def __init__(self, parameters: Optional[Mapping[str, "Parameter"]] = None) -> None:
+        super(ParameterDict, self).__init__()
+        self._initialized = True
+        if parameters is not None:
+            self.update(parameters)
+
+    def __setstate__(self, state):
+        state["_initialized"] = False
+        super(ParameterDict, self).__setstate__(state)
+        self._initialized = True
+
+    def __getitem__(self, key: str) -> "Parameter":
+        return self._parameters[key]
+
+    def __setitem__(self, key: str, parameter: "Parameter") -> None:
+        self.register_parameter(key, parameter)
+
+    def __delitem__(self, key: str) -> None:
+        del self._parameters[key]
+
+    def __setattr__(self, key: Any, value: Any) -> None:
+        if getattr(self, "_initialized", False):
+            if not hasattr(self, key) and (not isinstance(value, flow.nn.Parameter)):
+                warnings.warn("Setting attributes on ParameterDict is not supported.")
+        super(ParameterDict, self).__setattr__(key, value)
+
+    def __len__(self) -> int:
+        return len(self._parameters)
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._parameters.keys())
+
+    def __contains__(self, key: str) -> bool:
+        return key in self._parameters
+
+    def clear(self) -> None:
+        """Remove all items from the ParameterDict.
+        """
+        self._parameters.clear()
+
+
+class ModuleList(Module):
+    def __init__(self, modules: Optional[Iterable[Module]] = None) -> None:
+        super(ModuleList, self).__init__()
+        if modules is not None:
+            self += modules
+
+    def _get_abs_string_index(self, idx):
+        """Get the absolute index for the list of modules"""
+        idx = operator.index(idx)
+        if not -len(self) <= idx < len(self):
+            raise IndexError("index {} is out of range".format(idx))
+        if idx < 0:
+            idx += len(self)
+        return str(idx)
+
+    def __getitem__(self, idx: int) -> Module:
+        if isinstance(idx, slice):
+            return self.__class__(list(self._modules.values())[idx])
+        else:
+            return self._modules[self._get_abs_string_index(idx)]
+
+    def __setitem__(self, idx: int, module: Module) -> None:
+        idx = self._get_abs_string_index(idx)
+        return setattr(self, str(idx), module)
+
+    def __delitem__(self, idx: Union[int, slice]) -> None:
+        if isinstance(idx, slice):
+            for k in range(len(self._modules))[idx]:
+                delattr(self, str(k))
+        else:
+            delattr(self, self._get_abs_string_index(idx))
+        str_indices = [str(i) for i in range(len(self._modules))]
+        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))
+
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    def __iter__(self) -> Iterator[Module]:
+        return iter(self._modules.values())
+
+    def __iadd__(self: T, modules: Iterable[Module]) -> T:
+        return self.extend(modules)
+
+    def __dir__(self):
+        keys = super(ModuleList, self).__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    def insert(self, index: int, module: Module) -> None:
+        """Insert a given module before a given index in the list.
+
+        Arguments:
+            index (int): index to insert.
+            module (nn.Module): module to insert
+        """
+        for i in range(len(self._modules), index, -1):
+            self._modules[str(i)] = self._modules[str(i - 1)]
+        self._modules[str(index)] = module
+
+    def append(self: T, module: Module) -> T:
+        """Appends a given module to the end of the list.
+
+        Arguments:
+            module (nn.Module): module to append
+        """
+        self.add_module(str(len(self)), module)
+        return self
+
+    def extend(self: T, modules: Iterable[Module]) -> T:
+        """Appends modules from a Python iterable to the end of the list.
+
+        Arguments:
+            modules (iterable): iterable of modules to append
+        """
+        if not isinstance(modules, collections.abc.Iterable):
+            raise TypeError(
+                "ModuleList.extend should be called with an iterable, but got "
+                + type(modules).__name__
+            )
+        offset = len(self)
+        for (i, module) in enumerate(modules):
+            self.add_module(str(offset + i), module)
+        return self
+
+    def forward(self):
+        raise NotImplementedError()
+
+
+class ModuleDict(Module):
+    def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
+        super(ModuleDict, self).__init__()
+        if modules is not None:
+            self.update(modules)
+
+    def __getitem__(self, key: str) -> Module:
+        return self._modules[key]
+
+    def __setitem__(self, key: str, module: Module) -> None:
+        self.add_module(key, module)
+
+    def __delitem__(self, key: str) -> None:
+        del self._modules[key]
+
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._modules)
+
+    def __contains__(self, key: str) -> bool:
+        return key in self._modules
+
+    def clear(self) -> None:
+        """Remove all items from the ModuleDict.
+        """
+        self._modules.clear()
+
+    def pop(self, key: str) -> Module:
+        """Remove key from the ModuleDict and return its module.
+
+        Arguments:
+            key (string): key to pop from the ModuleDict
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    def keys(self) -> Iterable[str]:
+        """Return an iterable of the ModuleDict keys.
+        """
+        return self._modules.keys()
+
+    def items(self) -> Iterable[Tuple[str, Module]]:
+        """Return an iterable of the ModuleDict key/value pairs.
+        """
+        return self._modules.items()
+
+    def values(self) -> Iterable[Module]:
+        """Return an iterable of the ModuleDict values.
+        """
+        return self._modules.values()
+
+    def update(self, modules: Mapping[str, Module]) -> None:
+        if not isinstance(modules, collections.abc.Iterable):
+            raise TypeError(
+                "ModuleDict.update should be called with an iterable of key/value pairs, but got "
+                + type(modules).__name__
+            )
+        if isinstance(modules, (OrderedDict, ModuleDict, collections.abc.Mapping)):
+            for (key, module) in modules.items():
+                self[key] = module
+        else:
+            for (j, m) in enumerate(modules):
+                if not isinstance(m, collections.abc.Iterable):
+                    raise TypeError(
+                        "ModuleDict update sequence element #"
+                        + str(j)
+                        + " should be Iterable; is"
+                        + type(m).__name__
+                    )
+                if not len(m) == 2:
+                    raise ValueError(
+                        "ModuleDict update sequence element #"
+                        + str(j)
+                        + " has length "
+                        + str(len(m))
+                        + "; 2 is required"
+                    )
+                self[m[0]] = m[1]
+
+    def forward(self):
+        raise NotImplementedError()
+
+    def pop(self, key: str) -> "Parameter":
+        """Remove key from the ParameterDict and return its parameter.
+
+        Arguments:
+            key (string): key to pop from the ParameterDict
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    def keys(self) -> Iterable[str]:
+        """Return an iterable of the ParameterDict keys.
+        """
+        return self._parameters.keys()
+
+    def items(self) -> Iterable[Tuple[str, "Parameter"]]:
+        """Return an iterable of the ParameterDict key/value pairs.
+        """
+        return self._parameters.items()
+
+    def values(self) -> Iterable["Parameter"]:
+        """Return an iterable of the ParameterDict values.
+        """
+        return self._parameters.values()
+
+    def update(self, parameters: Mapping[str, "Parameter"]) -> None:
+        if not isinstance(parameters, collections.abc.Iterable):
+            raise TypeError(
+                "ParametersDict.update should be called with an iterable of key/value pairs, but got "
+                + type(parameters).__name__
+            )
+        if isinstance(parameters, (OrderedDict, ParameterDict)):
+            for (key, parameter) in parameters.items():
+                self[key] = parameter
+        elif isinstance(parameters, collections.abc.Mapping):
+            for (key, parameter) in sorted(parameters.items()):
+                self[key] = parameter
+        else:
+            for (j, p) in enumerate(parameters):
+                if not isinstance(p, collections.abc.Iterable):
+                    raise TypeError(
+                        "ParameterDict update sequence element #"
+                        + str(j)
+                        + " should be Iterable; is"
+                        + type(p).__name__
+                    )
+                if not len(p) == 2:
+                    raise ValueError(
+                        "ParameterDict update sequence element #"
+                        + str(j)
+                        + " has length "
+                        + str(len(p))
+                        + "; 2 is required"
+                    )
+                self[p[0]] = p[1]
+
+    def extra_repr(self) -> str:
+        child_lines = []
+        for (k, p) in self._parameters.items():
+            size_str = "x".join((str(size) for size in p.size()))
+            device_str = "" if not p.is_cuda else " (GPU {})".format(p.get_device())
+            parastr = "Parameter containing: [{} of size {}{}]".format(
+                type(p), size_str, device_str
+            )
+            child_lines.append("  (" + k + "): " + parastr)
+        tmpstr = "\n".join(child_lines)
+        return tmpstr
+
+    def __call__(self, input):
+        raise RuntimeError("ParameterDict should not be called.")
+
+    def _replicate_for_data_parallel(self):
+        warnings.warn(
+            "nn.ParameterDict is being used with DataParallel but this is not supported. This dict will appear empty for the models replicated on each GPU except the original one."
+        )
+        return super(ParameterDict, self)._replicate_for_data_parallel()
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/conv.py b/python/oneflow/nn/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a24f11ed7b7f0ca667d6e0b428ff62decca5886
--- /dev/null
+++ b/python/oneflow/nn/modules/conv.py
@@ -0,0 +1,489 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+import oneflow as flow
+from oneflow.nn import init
+from oneflow.nn.common_types import _size_1_t, _size_2_t
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _pair, _single
+
+
+def slice(x, begin, size):
+    ndim = len(x.shape)
+    if not isinstance(begin, (list, tuple)) or len(begin) != ndim:
+        raise ValueError(
+            "begin must be a list/tuple with the same length as input tensor's number of dimensions"
+        )
+    if not all((isinstance(b, int) or b is None for b in begin)):
+        raise ValueError("element of begin must be a int or None")
+    if not isinstance(size, (list, tuple)) or len(size) != ndim:
+        raise ValueError(
+            "size must be a list/tuple with the same length as input tensor's number of dimensions."
+        )
+    if not all((isinstance(s, int) or s is None for s in size)):
+        raise ValueError("element of size must be a int or None")
+    slice_tup_list = []
+    for (b, s, dim_size) in zip(begin, size, x.shape):
+        (start, stop, step) = (None, None, 1)
+        if b is not None:
+            if b < -dim_size or b >= dim_size:
+                raise ValueError("element of begin is out of range")
+            start = b
+        if s is not None:
+            if s == -1:
+                stop = dim_size
+            else:
+                if s <= 0 or s > dim_size:
+                    raise ValueError("element of size is invalid")
+                if b + s < dim_size:
+                    stop = b + s
+        slice_tup_list.append((start, stop, step))
+    return flow.slice(x, slice_tup_list)
+
+
+class ConvUtil(object):
+    @classmethod
+    def split(cls, x, axis, split_num):
+        split_len = x.shape[axis] // split_num
+        result_list = []
+        slice_begin = [0] * len(x.shape)
+        slice_size = [-1] * len(x.shape)
+        slice_size[axis] = split_len
+        for i in range(split_num):
+            slice_begin[axis] = i * split_len
+            result = slice(x, slice_begin, slice_size)
+            result_list.append(result)
+        return result_list
+
+
+class Conv1d(Module):
+    """The interface is consistent with PyTorch.    
+    The documentation is referenced from: https://pytorch.org/docs/master/generated/torch.nn.Conv1d.html#conv1d
+    
+    Applies a 1D convolution over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size
+    :math:`(N, C_{\\text{in}}, L)` and output :math:`(N, C_{\\text{out}}, L_{\\text{out}})` can be
+    precisely described as:
+
+    .. math::
+        \\text{out}(N_i, C_{\\text{out}_j}) = \\text{bias}(C_{\\text{out}_j}) +
+        \\sum_{k = 0}^{C_{in} - 1} \\text{weight}(C_{\\text{out}_j}, k)
+        \\star \\text{input}(N_i, k)
+
+    where :math:`\\star` is the valid `cross-correlation`_ operator,
+    :math:`N` is a batch size, :math:`C` denotes a number of channels,
+    :math:`L` is a length of signal sequence.
+
+    * :attr:`stride` controls the stride for the cross-correlation, a single
+      number or a one-element tuple.
+
+    * :attr:`padding` controls the amount of padding applied to the input. It
+      can be either a string {{'valid', 'same'}} or a tuple of ints giving the
+      amount of implicit padding applied on both sides.
+
+    * :attr:`dilation` controls the spacing between the kernel points; also
+      known as the 脿 trous algorithm. It is harder to describe, but this `link`_
+      has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        ``padding='valid'`` is the same as no padding. ``padding='same'`` pads
+        the input so the output has the shape as the input. However, this mode
+        doesn't support any stride values other than 1.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to both sides of
+            the input. Default: 0
+        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C_{in}, L_{in})`
+        - Output: :math:`(N, C_{out}, L_{out})` where
+
+          .. math::
+              L_{out} = \\left\\lfloor\\frac{L_{in} + 2 \\times \\text{padding} - \\text{dilation}
+                        \\times (\\text{kernel\\_size} - 1) - 1}{\\text{stride}} + 1\\right\\rfloor
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\\text{out\\_channels},
+            \\frac{\\text{in\\_channels}}{\\text{groups}}, \\text{kernel\\_size})`.
+            The values of these weights are sampled from
+            :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\text{kernel\\_size}}`
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels). If :attr:`bias` is ``True``, then the values of these weights are
+            sampled from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\text{kernel\\_size}}`
+
+    For example: 
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> import oneflow.nn as nn
+        
+        >>> arr = np.random.randn(20, 16, 50)
+        >>> input = flow.Tensor(arr)
+        >>> m = nn.Conv1d(16, 33, 3, stride=2)
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+    ):
+        super().__init__()
+        assert padding_mode == "zeros"
+        self.padding_mode = padding_mode
+        self.kernel_size = _single(kernel_size)
+        self.stride = _single(stride)
+        self.padding = _single(padding)
+        self.dilation = _single(dilation)
+        self.groups = groups
+        assert in_channels % groups == 0
+        assert out_channels % groups == 0
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.weight = flow.nn.Parameter(
+            flow.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        self.out_channel_groups = out_channels // groups
+        self.bias = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_channels))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        if x.device.type == "cpu" and self.groups > 1:
+            in_channel_axis = 1
+            weight_channel_axis = 0
+            bias_channel_axis = 0
+            in_split_list = ConvUtil.split(
+                x, axis=in_channel_axis, split_num=self.groups
+            )
+            out_list = []
+            for i in range(len(in_split_list)):
+                out_list.append(
+                    flow.F.conv1d(
+                        in_split_list[i],
+                        self.weight[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups,
+                            :,
+                            :,
+                        ],
+                        self.bias[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups
+                        ]
+                        if self.bias
+                        else None,
+                        stride=self.stride,
+                        padding=self.padding,
+                        dilation=self.dilation,
+                        groups=1,
+                    )
+                )
+            res = flow.cat(out_list, dim=in_channel_axis)
+        else:
+            res = flow.F.conv1d(
+                x,
+                self.weight,
+                self.bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+            )
+        return res
+
+    def extra_repr(self):
+        s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}"
+        if self.padding != (0,) * len(self.padding):
+            s += ", padding={padding}"
+        if self.dilation != (1,) * len(self.dilation):
+            s += ", dilation={dilation}"
+        if self.groups != 1:
+            s += ", groups={groups}"
+        if self.bias is None:
+            s += ", bias=False"
+        if self.padding_mode != "zeros":
+            s += ", padding_mode={padding_mode}"
+        return s.format(**self.__dict__)
+
+
+class Conv2d(Module):
+    """The interface is consistent with PyTorch.    
+    The documentation is referenced from: https://pytorch.org/docs/master/generated/torch.nn.Conv2d.html#conv2d
+    
+    Applies a 2D convolution over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size
+    :math:`(N, C_{\\text{in}}, H, W)` and output :math:`(N, C_{\\text{out}}, H_{\\text{out}}, W_{\\text{out}})`
+    can be precisely described as:
+
+    .. math::
+        \\text{out}(N_i, C_{\\text{out}_j}) = \\text{bias}(C_{\\text{out}_j}) +
+        \\sum_{k = 0}^{C_{\\text{in}} - 1} \\text{weight}(C_{\\text{out}_j}, k) \\star \\text{input}(N_i, k)
+
+
+    where :math:`\\star` is the valid 2D `cross-correlation`_ operator,
+    :math:`N` is a batch size, :math:`C` denotes a number of channels,
+    :math:`H` is a height of input planes in pixels, and :math:`W` is
+    width in pixels.
+
+
+    * :attr:`stride` controls the stride for the cross-correlation, a single
+      number or a tuple.
+    * :attr:`padding` controls the amount of implicit padding on both
+      sides for :attr:`padding` number of points for each dimension.
+    * :attr:`dilation` controls the spacing between the kernel points; also
+      known as the 脿 trous algorithm. It is harder to describe, but this `link`_
+      has a nice visualization of what :attr:`dilation` does.
+    * :attr:`groups` controls the connections between inputs and outputs.
+      :attr:`in_channels` and :attr:`out_channels` must both be divisible by
+      :attr:`groups`. For example,
+
+        * At groups=1, all inputs are convolved to all outputs.
+        * At groups=2, the operation becomes equivalent to having two conv
+          layers side by side, each seeing half the input channels
+          and producing half the output channels, and both subsequently
+          concatenated.
+        * At groups= :attr:`in_channels`, each input channel is convolved with
+          its own set of filters (of size
+          :math:`\\frac{\\text{out_channels}}{\\text{in_channels}}`).,
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Note:
+        When `groups == in_channels` and `out_channels == K * in_channels`,
+        where `K` is a positive integer, this operation is also known as a "depthwise convolution".
+
+        In other words, for an input of size :math:`(N, C_{in}, L_{in})`,
+        a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments
+        :math:`(C_\\text{in}=C_\\text{in}, C_\\text{out}=C_\\text{in} \\times \\text{K}, ..., \\text{groups}=C_\\text{in})`.
+
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in}  + 2 \\times \\text{padding}[0] - \\text{dilation}[0]
+                        \\times (\\text{kernel_size}[0] - 1) - 1}{\\text{stride}[0]} + 1\\right\\rfloor
+
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in}  + 2 \\times \\text{padding}[1] - \\text{dilation}[1]
+                        \\times (\\text{kernel_size}[1] - 1) - 1}{\\text{stride}[1]} + 1\\right\\rfloor
+
+    Attr:
+        - weight (Tensor): the learnable weights of the module of shape
+            :math:`(\\text{out_channels}, \\frac{\\text{in_channels}}{\\text{groups}},`
+            :math:`\\text{kernel_size[0]}, \\text{kernel_size[1]})`.
+            The values of these weights are sampled from
+            :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+
+        - bias (Tensor):   the learnable bias of the module of shape
+            (out_channels). If :attr:`bias` is ``True``,
+            then the values of these weights are
+            sampled from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+            :math:`k = \\frac{groups}{C_\\text{in} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+
+    For example: 
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> import oneflow.nn as nn
+        
+        >>> arr = np.random.randn(20, 16, 50, 100)
+        >>> input = flow.Tensor(arr)
+        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+    ):
+        super().__init__()
+        assert padding_mode == "zeros"
+        self.padding_mode = padding_mode
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        assert in_channels % groups == 0
+        assert out_channels % groups == 0
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.weight = flow.nn.Parameter(
+            flow.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        self.out_channel_groups = out_channels // groups
+        self.bias = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_channels))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        if x.shape[1] != self.in_channels:
+            raise ValueError("The input channels should be equal to self.in_channels")
+        if x.device.type == "cpu" and self.groups > 1:
+            in_channel_axis = 1
+            in_split_list = ConvUtil.split(
+                x, axis=in_channel_axis, split_num=self.groups
+            )
+            out_list = []
+            for i in range(len(in_split_list)):
+                out_list.append(
+                    flow.F.conv2d(
+                        in_split_list[i],
+                        self.weight[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups,
+                            :,
+                            :,
+                            :,
+                        ],
+                        self.bias[
+                            i
+                            * self.out_channel_groups : (i + 1)
+                            * self.out_channel_groups
+                        ]
+                        if self.bias
+                        else None,
+                        stride=self.stride,
+                        padding=self.padding,
+                        dilation=self.dilation,
+                        groups=1,
+                    )
+                )
+            res = flow.cat(out_list, dim=in_channel_axis)
+        else:
+            res = flow.F.conv2d(
+                x,
+                self.weight,
+                self.bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+            )
+        return res
+
+    def extra_repr(self):
+        s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}"
+        if self.padding != (0,) * len(self.padding):
+            s += ", padding={padding}"
+        if self.dilation != (1,) * len(self.dilation):
+            s += ", dilation={dilation}"
+        if self.groups != 1:
+            s += ", groups={groups}"
+        if self.bias is None:
+            s += ", bias=False"
+        if self.padding_mode != "zeros":
+            s += ", padding_mode={padding_mode}"
+        return s.format(**self.__dict__)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/dataset.py b/python/oneflow/nn/modules/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..84d53407bbd8c9babcd921487801c2cbf339275a
--- /dev/null
+++ b/python/oneflow/nn/modules/dataset.py
@@ -0,0 +1,593 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+import traceback
+from typing import List, Optional, Sequence, Tuple, Union
+
+import oneflow as flow
+from oneflow.nn.common_types import _size_1_t, _size_2_t, _size_3_t, _size_any_t
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _pair, _reverse_repeat_tuple, _single, _triple
+
+
+def mirrored_gen_random_seed(seed=None):
+    if seed is None:
+        seed = -1
+        has_seed = False
+    else:
+        has_seed = True
+    return (seed, has_seed)
+
+
+class OfrecordReader(Module):
+    def __init__(
+        self,
+        ofrecord_dir: str,
+        batch_size: int = 1,
+        data_part_num: int = 1,
+        part_name_prefix: str = "part-",
+        part_name_suffix_length: int = -1,
+        random_shuffle: bool = False,
+        shuffle_buffer_size: int = 1024,
+        shuffle_after_epoch: bool = False,
+        random_seed: int = -1,
+        name: Optional[str] = None,
+    ):
+        super().__init__()
+        (seed, has_seed) = mirrored_gen_random_seed(random_seed)
+        self._op = (
+            flow.builtin_op("OFRecordReader", name)
+            .Output("out")
+            .Attr("data_dir", ofrecord_dir)
+            .Attr("data_part_num", data_part_num)
+            .Attr("batch_size", batch_size)
+            .Attr("part_name_prefix", part_name_prefix)
+            .Attr("random_shuffle", random_shuffle)
+            .Attr("shuffle_buffer_size", shuffle_buffer_size)
+            .Attr("shuffle_after_epoch", shuffle_after_epoch)
+            .Attr("part_name_suffix_length", part_name_suffix_length)
+            .Attr("seed", seed)
+            .Build()
+        )
+
+    def forward(self):
+        res = self._op()[0]
+        return res
+
+
+class OfrecordRawDecoder(Module):
+    def __init__(
+        self,
+        blob_name: str,
+        shape: Sequence[int],
+        dtype: flow.dtype,
+        dim1_varying_length: bool = False,
+        truncate: bool = False,
+        auto_zero_padding: bool = False,
+        name: Optional[str] = None,
+    ):
+        super().__init__()
+        if auto_zero_padding:
+            print(
+                "WARNING: auto_zero_padding has been deprecated, Please use truncate instead.\n                "
+            )
+        self._op = (
+            flow.builtin_op("ofrecord_raw_decoder", name)
+            .Input("in")
+            .Output("out")
+            .Attr("name", blob_name)
+            .Attr("shape", shape)
+            .Attr("data_type", dtype)
+            .Attr("dim1_varying_length", dim1_varying_length)
+            .Attr("truncate", truncate or auto_zero_padding)
+            .Build()
+        )
+
+    def forward(self, input):
+        res = self._op(input)[0]
+        return res
+
+
+class CoinFlip(Module):
+    def __init__(
+        self,
+        batch_size: int = 1,
+        random_seed: Optional[int] = None,
+        probability: float = 0.5,
+    ):
+        super().__init__()
+        (seed, has_seed) = mirrored_gen_random_seed(random_seed)
+        self._op = (
+            flow.builtin_op("coin_flip")
+            .Output("out")
+            .Attr("batch_size", batch_size)
+            .Attr("probability", probability)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .Build()
+        )
+
+    def forward(self):
+        res = self._op()[0]
+        return res
+
+
+class CropMirrorNormalize(Module):
+    def __init__(
+        self,
+        color_space: str = "BGR",
+        output_layout: str = "NCHW",
+        crop_h: int = 0,
+        crop_w: int = 0,
+        crop_pos_y: float = 0.5,
+        crop_pos_x: float = 0.5,
+        mean: Sequence[float] = [0.0],
+        std: Sequence[float] = [1.0],
+        output_dtype: flow.dtype = flow.float,
+    ):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("crop_mirror_normalize_from_uint8")
+            .Input("in")
+            .Input("mirror")
+            .Output("out")
+            .Attr("color_space", color_space)
+            .Attr("output_layout", output_layout)
+            .Attr("mean", mean)
+            .Attr("std", std)
+            .Attr("crop_h", crop_h)
+            .Attr("crop_w", crop_w)
+            .Attr("crop_pos_y", crop_pos_y)
+            .Attr("crop_pos_x", crop_pos_x)
+            .Attr("output_dtype", output_dtype)
+            .Build()
+        )
+        self._val_op = (
+            flow.builtin_op("crop_mirror_normalize_from_tensorbuffer")
+            .Input("in")
+            .Output("out")
+            .Attr("color_space", color_space)
+            .Attr("output_layout", output_layout)
+            .Attr("mean", mean)
+            .Attr("std", std)
+            .Attr("crop_h", crop_h)
+            .Attr("crop_w", crop_w)
+            .Attr("crop_pos_y", crop_pos_y)
+            .Attr("crop_pos_x", crop_pos_x)
+            .Attr("output_dtype", output_dtype)
+            .Build()
+        )
+
+    def forward(self, input, mirror=None):
+        if mirror != None:
+            res = self._op(input, mirror)[0]
+        else:
+            res = self._val_op(input)[0]
+        return res
+
+
+class OFRecordImageDecoderRandomCrop(Module):
+    def __init__(
+        self,
+        blob_name: str,
+        color_space: str = "BGR",
+        num_attempts: int = 10,
+        random_seed: Optional[int] = None,
+        random_area: Sequence[float] = [0.08, 1.0],
+        random_aspect_ratio: Sequence[float] = [0.75, 1.333333],
+    ):
+        super().__init__()
+        (seed, has_seed) = mirrored_gen_random_seed(random_seed)
+        self._op = (
+            flow.builtin_op("ofrecord_image_decoder_random_crop")
+            .Input("in")
+            .Output("out")
+            .Attr("name", blob_name)
+            .Attr("color_space", color_space)
+            .Attr("num_attempts", num_attempts)
+            .Attr("random_area", random_area)
+            .Attr("random_aspect_ratio", random_aspect_ratio)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .Build()
+        )
+
+    def forward(self, input):
+        res = self._op(input)[0]
+        return res
+
+
+class OFRecordImageDecoder(Module):
+    def __init__(self, blob_name: str, color_space: str = "BGR"):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("ofrecord_image_decoder")
+            .Input("in")
+            .Output("out")
+            .Attr("name", blob_name)
+            .Attr("color_space", color_space)
+            .Build()
+        )
+
+    def forward(self, input):
+        res = self._op(input)[0]
+        return res
+
+
+class TensorBufferToListOfTensors(Module):
+    def __init__(
+        self, out_shapes, out_dtypes, out_num: int = 1, dynamic_out: bool = False
+    ):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("tensor_buffer_to_list_of_tensors_v2")
+            .Input("in")
+            .Output("out", out_num)
+            .Attr("out_shapes", out_shapes)
+            .Attr("out_dtypes", out_dtypes)
+            .Attr("dynamic_out", dynamic_out)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)
+
+
+def tensor_buffer_to_list_of_tensors(tensor, out_shapes, out_dtypes):
+    return TensorBufferToListOfTensors(
+        [list(out_shape) for out_shape in out_shapes], out_dtypes, len(out_shapes)
+    )(tensor)
+
+
+class ImageResize(Module):
+    def __init__(
+        self,
+        target_size: Union[int, Sequence[int]] = None,
+        min_size: Optional[int] = None,
+        max_size: Optional[int] = None,
+        keep_aspect_ratio: bool = False,
+        resize_side: str = "shorter",
+        channels: int = 3,
+        dtype: Optional[flow.dtype] = None,
+        interpolation_type: str = "auto",
+        name: Optional[str] = None,
+        color_space: Optional[str] = None,
+        interp_type: Optional[str] = None,
+        resize_shorter: int = 0,
+        resize_x: int = 0,
+        resize_y: int = 0,
+    ):
+        super().__init__()
+        deprecated_param_used = False
+        if color_space is not None:
+            print(
+                "WARNING: color_space has been deprecated. Please use channels instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            assert isinstance(color_space, str)
+            if color_space.upper() == "RGB" or color_space.upper() == "BGR":
+                channels = 3
+            elif color_space.upper() == "GRAY":
+                channels = 1
+            else:
+                raise ValueError("invalid color_space")
+        if interp_type is not None:
+            print(
+                "WARNING: interp_type has been deprecated. Please use interpolation_type instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            assert isinstance(interp_type, str)
+            if interp_type == "Linear":
+                interpolation_type = "bilinear"
+            elif interp_type == "NN":
+                interpolation_type = "nearest_neighbor"
+            elif interp_type == "Cubic":
+                interpolation_type = "bicubic"
+            else:
+                raise ValueError("invalid interp_type")
+        if resize_x > 0 and resize_y > 0:
+            print(
+                "WARNING: resize_x and resize_y has been deprecated. Please use target_size instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            target_size = (resize_x, resize_y)
+            keep_aspect_ratio = False
+        if resize_shorter > 0:
+            print(
+                "WARNING: resize_shorter has been deprecated. Please use target_size instead."
+            )
+            print(traceback.format_stack()[-2])
+            deprecated_param_used = True
+            target_size = resize_shorter
+            keep_aspect_ratio = True
+            resize_side = "shorter"
+        if keep_aspect_ratio:
+            if not isinstance(target_size, int):
+                raise ValueError(
+                    "target_size must be an int when keep_aspect_ratio is True"
+                )
+            if min_size is None:
+                min_size = 0
+            if max_size is None:
+                max_size = 0
+            if resize_side == "shorter":
+                resize_longer = False
+            elif resize_side == "longer":
+                resize_longer = True
+            else:
+                raise ValueError('resize_side must be "shorter" or "longer"')
+            self._op = (
+                flow.builtin_op("image_resize_keep_aspect_ratio")
+                .Input("in")
+                .Output("out")
+                .Output("size")
+                .Output("scale")
+                .Attr("target_size", target_size)
+                .Attr("min_size", min_size)
+                .Attr("max_size", max_size)
+                .Attr("resize_longer", resize_longer)
+                .Attr("interpolation_type", interpolation_type)
+                .Build()
+            )
+        else:
+            if (
+                not isinstance(target_size, (list, tuple))
+                or len(target_size) != 2
+                or (not all((isinstance(size, int) for size in target_size)))
+            ):
+                raise ValueError(
+                    "target_size must be a form like (width, height) when keep_aspect_ratio is False"
+                )
+            if dtype is None:
+                dtype = flow.uint8
+            (target_w, target_h) = target_size
+            self._op = (
+                flow.builtin_op("image_resize_to_fixed")
+                .Input("in")
+                .Output("out")
+                .Output("scale")
+                .Attr("target_width", target_w)
+                .Attr("target_height", target_h)
+                .Attr("channels", channels)
+                .Attr("data_type", dtype)
+                .Attr("interpolation_type", interpolation_type)
+                .Build()
+            )
+
+    def forward(self, input):
+        res = self._op(input)
+        res_image = res[0]
+        if len(res) == 3:
+            new_size = flow.tensor_buffer_to_tensor(
+                res[1], dtype=flow.int32, instance_shape=(2,)
+            )
+            scale = flow.tensor_buffer_to_tensor(
+                res[2], dtype=flow.float32, instance_shape=(2,)
+            )
+        else:
+            new_size = None
+            scale = res[1]
+        return (res_image, scale, new_size)
+
+
+def raw_decoder(
+    input_record,
+    blob_name: str,
+    shape: Sequence[int],
+    dtype: flow.dtype,
+    dim1_varying_length: bool = False,
+    truncate: bool = False,
+    auto_zero_padding: bool = False,
+    name: Optional[str] = None,
+):
+    if auto_zero_padding:
+        print(
+            "WARNING: auto_zero_padding has been deprecated, Please use truncate instead.\n            "
+        )
+    return OfrecordRawDecoder(
+        blob_name,
+        shape,
+        dtype,
+        dim1_varying_length,
+        truncate or auto_zero_padding,
+        name,
+    ).forward(input_record)
+
+
+def get_ofrecord_handle(
+    ofrecord_dir: str,
+    batch_size: int = 1,
+    data_part_num: int = 1,
+    part_name_prefix: str = "part-",
+    part_name_suffix_length: int = -1,
+    random_shuffle: bool = False,
+    shuffle_buffer_size: int = 1024,
+    shuffle_after_epoch: bool = False,
+    name: Optional[str] = None,
+):
+    return OfrecordReader(
+        ofrecord_dir,
+        batch_size,
+        data_part_num,
+        part_name_prefix,
+        part_name_suffix_length,
+        random_shuffle,
+        shuffle_buffer_size,
+        shuffle_after_epoch,
+        name,
+    )()
+
+
+class ImageFlip(Module):
+    """This operator flips the images.
+
+    The flip code corresponds to the different flip mode:
+
+    0 (0x00): Non Flip
+
+    1 (0x01): Horizontal Flip
+
+    16 (0x10): Vertical Flip
+
+    17 (0x11): Both Horizontal and Vertical Flip
+
+    Args:
+        images: The input images.
+        flip_code: The flip code.
+
+    Returns:
+        The result image.
+
+    For example:
+    
+    .. code-block:: python
+        
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> import oneflow.nn as nn
+        
+        >>> arr = np.array([
+        ...    [[[1, 2, 3], [3, 2, 1]],
+        ...     [[2, 3, 4], [4, 3, 2]]],
+        ...    [[[3, 4, 5], [5, 4, 3]],
+        ...     [[4, 5, 6], [6, 5, 4]]]])
+        >>> image_tensors = flow.Tensor(arr, device=flow.device("cpu"))
+        >>> image_tensor_buffer = flow.tensor_to_tensor_buffer(image_tensors, instance_dims=3)
+        >>> output = nn.image.flip(1)(image_tensor_buffer).numpy()
+        >>> output[0]
+        array([[[3., 2., 1.],
+                [1., 2., 3.]],
+        <BLANKLINE>
+               [[4., 3., 2.],
+                [2., 3., 4.]]], dtype=float32)
+        >>> output[1]
+        array([[[5., 4., 3.],
+                [3., 4., 5.]],
+        <BLANKLINE>
+               [[6., 5., 4.],
+                [4., 5., 6.]]], dtype=float32)
+    """
+
+    def __init__(self, flip_code):
+        super().__init__()
+        self.flip_code = flip_code
+
+    def forward(self, images):
+        flip_codes = flow.Tensor([self.flip_code] * images.shape[0], dtype=flow.int8)
+        return flow.F.image_flip(images, flip_codes)
+
+
+class ImageDecode(Module):
+    def __init__(self, dtype: flow.dtype = flow.uint8, color_space: str = "BGR"):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("image_decode")
+            .Input("in")
+            .Output("out")
+            .Attr("color_space", color_space)
+            .Attr("data_type", dtype)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+class ImageNormalize(Module):
+    def __init__(self, std: Sequence[float], mean: Sequence[float]):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("image_normalize")
+            .Input("in")
+            .Output("out")
+            .Attr("std", std)
+            .Attr("mean", mean)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+class COCOReader(Module):
+    def __init__(
+        self,
+        annotation_file: str,
+        image_dir: str,
+        batch_size: int,
+        shuffle: bool = True,
+        random_seed: Optional[int] = None,
+        group_by_aspect_ratio: bool = True,
+        remove_images_without_annotations: bool = True,
+        stride_partition: bool = True,
+    ):
+        super().__init__()
+        if random_seed is None:
+            random_seed = random.randrange(sys.maxsize)
+        self._op = (
+            flow.builtin_op("COCOReader")
+            .Output("image")
+            .Output("image_id")
+            .Output("image_size")
+            .Output("gt_bbox")
+            .Output("gt_label")
+            .Output("gt_segm")
+            .Output("gt_segm_index")
+            .Attr("session_id", flow.current_scope().session_id)
+            .Attr("annotation_file", annotation_file)
+            .Attr("image_dir", image_dir)
+            .Attr("batch_size", batch_size)
+            .Attr("shuffle_after_epoch", shuffle)
+            .Attr("random_seed", random_seed)
+            .Attr("group_by_ratio", group_by_aspect_ratio)
+            .Attr(
+                "remove_images_without_annotations", remove_images_without_annotations
+            )
+            .Attr("stride_partition", stride_partition)
+            .Build()
+        )
+
+    def forward(self):
+        res = self._op()
+        return res
+
+
+class ImageBatchAlign(Module):
+    def __init__(self, shape: Sequence[int], dtype: flow.dtype, alignment: int):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("image_batch_align")
+            .Input("in")
+            .Output("out")
+            .Attr("shape", shape)
+            .Attr("data_type", dtype)
+            .Attr("alignment", alignment)
+            .Attr("dynamic_out", False)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/deconv.py b/python/oneflow/nn/modules/deconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69794aea2a37bbb86fa10e7c0e7d7b120f9e09e
--- /dev/null
+++ b/python/oneflow/nn/modules/deconv.py
@@ -0,0 +1,237 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+import oneflow as flow
+from oneflow.nn import init
+from oneflow.nn.common_types import _size_2_t
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _pair
+
+
+def slice(x, begin, size):
+    ndim = len(x.shape)
+    if not isinstance(begin, (list, tuple)) or len(begin) != ndim:
+        raise ValueError(
+            "begin must be a list/tuple with the same length as input tensor's number of dimensions"
+        )
+    if not all((isinstance(b, int) or b is None for b in begin)):
+        raise ValueError("element of begin must be a int or None")
+    if not isinstance(size, (list, tuple)) or len(size) != ndim:
+        raise ValueError(
+            "size must be a list/tuple with the same length as input tensor's number of dimensions."
+        )
+    if not all((isinstance(s, int) or s is None for s in size)):
+        raise ValueError("element of size must be a int or None")
+    slice_tup_list = []
+    for (b, s, dim_size) in zip(begin, size, x.shape):
+        (start, stop, step) = (None, None, 1)
+        if b is not None:
+            if b < -dim_size or b >= dim_size:
+                raise ValueError("element of begin is out of range")
+            start = b
+        if s is not None:
+            if s == -1:
+                stop = dim_size
+            else:
+                if s <= 0 or s > dim_size:
+                    raise ValueError("element of size is invalid")
+                if b + s < dim_size:
+                    stop = b + s
+        slice_tup_list.append((start, stop, step))
+    return flow.slice(x, slice_tup_list)
+
+
+class ConvUtil(object):
+    @classmethod
+    def split(cls, x, axis, split_num):
+        split_len = x.shape[axis] // split_num
+        result_list = []
+        slice_begin = [0] * len(x.shape)
+        slice_size = [-1] * len(x.shape)
+        slice_size[axis] = split_len
+        for i in range(split_num):
+            slice_begin[axis] = i * split_len
+            result = slice(x, slice_begin, slice_size)
+            result_list.append(result)
+        return result_list
+
+
+class ConvTranspose2d(Module):
+    """
+    
+    Applies a 2D transposed convolution operator over an input image composed of several input planes.
+
+    This module can be seen as the gradient of Conv2d with respect to its input.
+    It is also known as a fractionally-strided convolution or
+    a deconvolution (although it is not an actual deconvolution operation).
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    Shape:
+        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
+
+        .. math::
+              H_{out} = (H_{in} - 1) \\times \\text{stride}[0] - 2 \\times \\text{padding}[0] + \\text{dilation}[0] 
+
+                        \\times (\\text{kernel_size}[0] - 1) + \\text{output_padding}[0] + 1
+        .. math::
+              W_{out} = (W_{in} - 1) \\times \\text{stride}[1] - 2 \\times \\text{padding}[1] + \\text{dilation}[1]
+              
+                        \\times (\\text{kernel_size}[1] - 1) + \\text{output_padding}[1] + 1
+
+    Attributes:
+        ConvTranspose2d.weight (Tensor): the learnable weights of the module of shape
+                         :math:`(\\text{in_channels}, \\frac{\\text{out_channels}}{\\text{groups}},`
+                         :math:`\\text{kernel_size[0]}, \\text{kernel_size[1]})`.
+                         The values of these weights are sampled from
+                         :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+                         :math:`k = \\frac{groups}{C_\\text{out} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+        ConvTranspose2d.bias (Tensor): the learnable bias of the module of shape (out_channels)
+                         If :attr:`bias` is ``True``, then the values of these weights are
+                         sampled from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where
+                         :math:`k = \\frac{groups}{C_\\text{out} * \\prod_{i=0}^{1}\\text{kernel_size}[i]}`
+
+    Examples::
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> import oneflow.nn as nn
+        
+        >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> m = m.to("cuda")
+        >>> input = flow.Tensor(np.random.randn(20, 16, 50, 100), device=flow.device("cuda"))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([20, 33, 93, 100])
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        output_padding: _size_2_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: int = 1,
+        padding_mode: str = "zeros",
+    ) -> None:
+        super().__init__()
+        assert padding_mode == "zeros"
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        output_padding = _pair(output_padding)
+        dilation = _pair(dilation)
+        self.groups = groups
+        assert in_channels % groups == 0
+        assert out_channels % groups == 0
+        self.weight = flow.nn.Parameter(
+            flow.Tensor(in_channels, out_channels // groups, *kernel_size)
+        )
+        self.in_channel_groups = in_channels // groups
+        self.bias = None
+        self._bias_add_op = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_channels))
+            self._bias_add_op = (
+                flow.builtin_op("bias_add")
+                .Input("a")
+                .Input("b")
+                .Output("out")
+                .Attr("axis", 1)
+                .Build()
+            )
+        self._op = (
+            flow.builtin_op("deconv2d")
+            .Input("in")
+            .Input("weight")
+            .Attr("filters", out_channels // groups)
+            .Attr("padding_before", padding)
+            .Attr("data_format", "channels_first")
+            .Attr("kernel_size", kernel_size)
+            .Attr("strides", stride)
+            .Attr("dilation_rate", dilation)
+            .Attr("output_padding", output_padding)
+            .Attr("groups", 1)
+            .Output("out")
+            .Build()
+        )
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        if self.groups > 1:
+            in_channel_axis = 1
+            in_split_list = ConvUtil.split(
+                x, axis=in_channel_axis, split_num=self.groups
+            )
+            out_list = []
+            for i in range(len(in_split_list)):
+                out_list.append(
+                    self._op(
+                        in_split_list[i],
+                        self.weight[
+                            i
+                            * self.in_channel_groups : (i + 1)
+                            * self.in_channel_groups,
+                            :,
+                            :,
+                            :,
+                        ],
+                    )[0]
+                )
+            res = flow.cat(out_list, dim=in_channel_axis)
+        else:
+            res = self._op(x, self.weight)[0]
+        if self._bias_add_op is not None:
+            res = self._bias_add_op(res, self.bias)[0]
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/diag.py b/python/oneflow/nn/modules/diag.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a78aceeeecad2067a480de30f8945154ca0c5b3
--- /dev/null
+++ b/python/oneflow/nn/modules/diag.py
@@ -0,0 +1,77 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Diag(Module):
+    def __init__(self, diagonal=0):
+        super().__init__()
+        self.diagonal = diagonal
+
+    def forward(self, input):
+        return flow.F.diag(input, self.diagonal)
+
+
+def diag_op(input, diagonal=0):
+    """
+    If input is a vector (1-D tensor), then returns a 2-D square tensor with the elements of input as the diagonal.
+    If input is a matrix (2-D tensor), then returns a 1-D tensor with diagonal elements of input.
+
+    Args:
+        input (Tensor): the input tensor.
+        diagonal (Optional[int], 0): The diagonal to consider. 
+            If diagonal = 0, it is the main diagonal. If diagonal > 0, it is above the main diagonal. If diagonal < 0, it is below the main diagonal. Defaults to 0.
+    
+    Returns:
+        oneflow.Tensor: the output Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> arr = np.array(
+        ...     [
+        ...        [1.0, 2.0, 3.0],
+        ...        [4.0, 5.0, 6.0],
+        ...        [7.0, 8.0, 9.0],
+        ...     ]
+        ... )
+
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> flow.diag(input)
+        tensor([1., 5., 9.], dtype=oneflow.float32)
+    """
+    return Diag(diagonal)(input)
+
+
+@register_tensor_op("diag")
+def diag_op_tensor(input, diagonal=0):
+    """
+    diag() -> Tensor
+    See :func:`oneflow.diag`
+    
+    """
+    return Diag(diagonal)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/dropout.py b/python/oneflow/nn/modules/dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..f351635515f969ea5cfd1e424ab531c0ce2fe939
--- /dev/null
+++ b/python/oneflow/nn/modules/dropout.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+
+import oneflow as flow
+import oneflow.framework.id_util as id_util
+from oneflow.nn.module import Module
+
+
+class _DropoutNd(Module):
+    __constants__ = ["p", "inplace"]
+    p: float
+    inplace: bool
+
+    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
+        super(_DropoutNd, self).__init__()
+        if p < 0 or p > 1:
+            raise ValueError(
+                "dropout probability has to be between 0 and 1, but got {}".format(p)
+            )
+        self.p = p
+        self.inplace = inplace
+
+    def extra_repr(self) -> str:
+        return "p={}, inplace={}".format(self.p, self.inplace)
+
+
+class Dropout(_DropoutNd):
+    """During training, randomly zeroes some of the elements of the input
+    tensor with probability :attr:`p` using samples from a Bernoulli
+    distribution. Each channel will be zeroed out independently on every forward
+    call.
+
+    This has proven to be an effective technique for regularization and
+    preventing the co-adaptation of neurons as described in the paper
+    "Improving neural networks by preventing co-adaptation of feature
+    detectors".
+
+    Furthermore, the outputs are scaled by a factor of :math:`\\frac{1}{1-p}` during
+    training. This means that during evaluation the module simply computes an
+    identity function.
+
+    Args:
+        p: probability of an element to be zeroed. Default: 0.5
+        inplace: If set to ``True``, will do this operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`. Input can be of any shape
+        - Output: :math:`(*)`. Output is of the same shape as input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> m = flow.nn.Dropout(p=0)
+        >>> arr = np.array(
+        ...    [
+        ...        [-0.7797, 0.2264, 0.2458, 0.4163],
+        ...        [0.4299, 0.3626, -0.4892, 0.4141],
+        ...        [-1.4115, 1.2183, -0.5503, 0.6520],
+        ...    ]
+        ... )
+        >>> x = flow.Tensor(arr)
+        >>> y = m(x)
+        >>> y #doctest: +ELLIPSIS
+        tensor([[-0.7797,  0.2264,  0.2458,  0.4163],
+                ...
+                [-1.4115,  1.2183, -0.5503,  0.652 ]], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(self, p: float = 0.5, inplace: bool = False, generator=None):
+        _DropoutNd.__init__(self, p, inplace)
+        self.p = p
+        if generator is None:
+            generator = flow.Generator()
+        self.generator = generator
+
+    def forward(self, x):
+        if self.p == 0.0 or not self.training:
+            return x
+        return flow.F.dropout(x, self.p, self.generator)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/eq.py b/python/oneflow/nn/modules/eq.py
new file mode 100644
index 0000000000000000000000000000000000000000..767aedbe6f75067e04677b9c881be053a429eaa6
--- /dev/null
+++ b/python/oneflow/nn/modules/eq.py
@@ -0,0 +1,79 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Eq(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, other):
+        if isinstance(other, flow.Tensor) or isinstance(
+            other, flow._oneflow_internal.Tensor
+        ):
+            for i in range(len(input.size())):
+                assert (
+                    input.shape[i] >= other.shape[i]
+                ), "The second tensor's shape should broadcastable with the first argument."
+                if input.dtype != other.dtype:
+                    other = other.to(dtype=input.dtype)
+        elif isinstance(other, int) or isinstance(other, float):
+            other = flow.Tensor([other], dtype=input.dtype, device=input.device)
+        else:
+            raise NotImplementedError(
+                "Unsupport data type, The second argument can be a tensor whose shape is broadcastable with the first argument."
+            )
+        return flow.F.broadcast_equal(input, other)
+
+
+@register_tensor_op("eq")
+def eq_op(input, other):
+    """
+    Computes element-wise equality.
+    The second argument can be a number or a tensor whose shape is broadcastable with the first argument.
+
+    Args:
+        input (oneflow.Tensor): the tensor to compare
+        other (oneflow.Tensor, float or int): the target to compare
+
+    Returns:
+
+        - A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> input = flow.Tensor(np.array([2, 3, 4, 5]), dtype=flow.float32)
+        >>> other = flow.Tensor(np.array([2, 3, 4, 1]), dtype=flow.float32)
+
+        >>> y = flow.eq(input, other)
+        >>> y
+        tensor([1, 1, 1, 0], dtype=oneflow.int8)
+
+    """
+    return Eq()(input, other)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/exp.py b/python/oneflow/nn/modules/exp.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b197210190079a97dd6e3402b39369c3fd0fa9d
--- /dev/null
+++ b/python/oneflow/nn/modules/exp.py
@@ -0,0 +1,64 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Exp(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.exp(x)
+
+
+@register_tensor_op("exp")
+def exp_op(x):
+    """This operator computes the exponential of Tensor.
+
+    The equation is:
+
+    .. math::
+
+        out = e^x
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = flow.Tensor(np.array([1, 2, 3]).astype(np.float32))
+        >>> y = x.exp()
+        >>> y
+        tensor([ 2.7183,  7.3891, 20.0855], dtype=oneflow.float32)
+
+    """
+    return Exp()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/expand.py b/python/oneflow/nn/modules/expand.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d9bb1399102311c47f25d3181f2a1c2e5b7c3b0
--- /dev/null
+++ b/python/oneflow/nn/modules/expand.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Expand(Module):
+    def __init__(self, *sizes) -> None:
+        super().__init__()
+        self.expand_size = list(*sizes)
+
+    def forward(self, x):
+        if x.dtype == flow.int8:
+            x = flow.cast(x, flow.int32)
+        return flow.F.expand(x, self.expand_size)
+
+
+@register_tensor_op("expand")
+def expand_op(x, *sizes):
+    """This operator expand the input tensor to a larger size.
+
+    Passing -1 as the size for a dimension means not changing the size of that dimension.
+
+    Tensor can be also expanded to a larger number of dimensions and the new ones will be appended at the front.
+
+    For the new dimensions, the size cannot be set to -1.
+
+    Args:
+        x (oneflow.Tensor): The input Tensor.
+        *sizes  (flow.Size or int): The desired expanded size.
+
+    Returns:
+        oneflow.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = np.array([[[[0, 1]],
+        ...               [[2, 3]],
+        ...               [[4, 5]]]]).astype(np.int32)
+
+        >>> input = flow.Tensor(x)
+
+        >>> out = input.expand(1, 3, 2, 2)
+        >>> out.shape
+        flow.Size([1, 3, 2, 2])
+
+    """
+    return Expand(sizes)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/flatten.py b/python/oneflow/nn/modules/flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf4beae9a365746321a8bd6beddd8355a616a15e
--- /dev/null
+++ b/python/oneflow/nn/modules/flatten.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Flatten(Module):
+    """Flattens a contiguous range of dims into a tensor. For use with: nn.Sequential.
+
+    Args:
+        start_dim: first dim to flatten (default = 1).
+        end_dim: last dim to flatten (default = -1).
+    
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import oneflow as flow
+        >>> input = flow.Tensor(32, 1, 5, 5)
+        >>> m = flow.nn.Flatten()
+        >>> output = m(input)
+        >>> output.shape
+        flow.Size([32, 25])
+
+    """
+
+    def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
+        super().__init__()
+        self.start_dim = start_dim
+        self.end_dim = end_dim
+
+    def forward(self, input):
+        return flow.F.flatten(input, start_dim=self.start_dim, end_dim=self.end_dim)
+
+    def extra_repr(self) -> str:
+        return "start_dim={}, end_dim={}".format(self.start_dim, self.end_dim)
+
+
+@register_tensor_op("flatten")
+def _flow_flatten(input, start_dim: int = 0, end_dim: int = -1):
+    """Flattens a contiguous range of dims into a tensor.
+
+    Args:
+        start_dim: first dim to flatten (default = 0).
+        end_dim: last dim to flatten (default = -1).
+    
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> input = flow.Tensor(32, 1, 5, 5)
+        >>> output = input.flatten(start_dim=1)
+        >>> output.shape
+        flow.Size([32, 25])
+
+    """
+    return Flatten(start_dim=start_dim, end_dim=end_dim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/flip.py b/python/oneflow/nn/modules/flip.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8739226401ccf65e7d33e76d663df3035d23f44
--- /dev/null
+++ b/python/oneflow/nn/modules/flip.py
@@ -0,0 +1,93 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _check_axis
+
+
+class Flip(Module):
+    def __init__(self, dims) -> None:
+        super().__init__()
+        assert isinstance(dims, (list, tuple)), f"dims must be list or tuple"
+        self.dims = dims
+
+    def forward(self, x):
+        input_len = len(x.shape)
+        assert (
+            len(self.dims) <= input_len
+        ), f"len of dims must less than len of input tensor"
+        new_dims = []
+        for i in self.dims:
+            if i < 0:
+                i += input_len
+            assert (
+                i < input_len
+            ), f"IndexError: Dimension out of range (expected to be in range of {input_len}, but got {i})"
+            new_dims.append(i)
+        return flow.F.flip(x, new_dims)
+
+
+def flip_op(input, dims):
+    """
+    
+    Reverse the order of a n-D tensor along given axis in dims.
+
+    .. note::
+        `flow.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`,
+        which returns a view in constant time. Since copying a tensor's data is more work than viewing that data,
+        `flow.flip` is expected to be slower than `np.flip`.
+
+    Args:
+        input (Tensor): the input tensor
+        dims (a list or tuple): axis to flip on
+        
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> np_arr = np.arange(0, 8).reshape((2, 2, 2)).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> out = flow.flip(input, [0, 1])
+        >>> out
+        tensor([[[6., 7.],
+                 [4., 5.]],
+        <BLANKLINE>
+                [[2., 3.],
+                 [0., 1.]]], dtype=oneflow.float32)
+
+    """
+    return Flip(dims)(input)
+
+
+@register_tensor_op("flip")
+def flip_op_tensor(input, dims):
+    """
+    See :func:`oneflow.flip`
+    """
+    return Flip(dims)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/floor.py b/python/oneflow/nn/modules/floor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ada45de2a4325ca078bc7114fbb0b2454b013c9
--- /dev/null
+++ b/python/oneflow/nn/modules/floor.py
@@ -0,0 +1,79 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _check_axis
+
+
+class Floor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.floor(x)
+
+
+def floor_op(x):
+    """
+    Returns a new tensor with the arcsine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\lfloor \\text{input}_{i} \\rfloor
+
+    Args:
+        input (Tensor): the input tensor.
+        
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.array([-0.5,  1.5, 0,  0.8]), dtype=flow.float32)
+        >>> output = flow.floor(input)
+        >>> output.shape
+        flow.Size([4])
+        >>> output.numpy()
+        array([-1.,  1.,  0.,  0.], dtype=float32)
+        
+        >>> input1 = flow.Tensor(np.array([[0.8, 1.0], [-0.6, 2.5]]), dtype=flow.float32)
+        >>> output1 = input1.floor()
+        >>> output1.shape
+        flow.Size([2, 2])
+        >>> output1.numpy()
+        array([[ 0.,  1.],
+               [-1.,  2.]], dtype=float32)
+
+    """
+    return Floor()(x)
+
+
+@register_tensor_op("floor")
+def floor_op_tensor(input):
+    """
+    See :func:`oneflow.floor`
+    """
+    return Floor()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/gather.py b/python/oneflow/nn/modules/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..73f5ddbecb3b62eb5fb6f68b626d6b3bf7793123
--- /dev/null
+++ b/python/oneflow/nn/modules/gather.py
@@ -0,0 +1,85 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import List, Optional, Tuple
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor, register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Gather(Module):
+    def __init__(self, dim: int = 0, sparse_grad: bool = False):
+        super().__init__()
+        assert sparse_grad is False, "Only support bool = False for now!"
+        self.dim = dim
+
+    def forward(self, input, index):
+        assert self.dim < len(
+            index.shape
+        ), "Value of dim is out of range(dim should be less than len(index.shape))"
+        assert len(input.shape) == len(
+            index.shape
+        ), "Dimensions of input and index should equal"
+        for i in range(0, len(input.shape)):
+            if self.dim == i:
+                continue
+            else:
+                assert (
+                    input.shape[i] == index.shape[i]
+                ), "Dimensions of input and index should be same except at dim"
+        return flow.F.dim_gather(input, index, dim=self.dim)
+
+
+@register_tensor_op("gather")
+def gather_op(input, index, dim=0, sparse_grad=False):
+    """Gathers values along an axis specified by `dim`.
+
+    For a 3-D tensor the output is specified by::
+
+        out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+        out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+        out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+
+    :attr:`input` and :attr:`index` must have the same number of dimensions.
+    It is also required that ``index.size(d) <= input.size(d)`` for all
+    dimensions ``d != dim``.  :attr:`out` will have the same shape as :attr:`index`.
+    Note that ``input`` and ``index`` do not broadcast against each other.
+
+    Args:
+        input (Tensor): the source tensor
+        dim (int): the axis along which to index
+        index (LongTensor): the indices of elements to gather
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = np.random.randn(3, 4, 3, 5)
+        >>> index = np.random.choice(np.arange(3), size=180, replace=True).reshape((3, 4, 3, 5))
+        >>> output = flow.gather(flow.Tensor(input), flow.Tensor(index, dtype=flow.int), dim=1)
+        >>> output.shape
+        flow.Size([3, 4, 3, 5])
+
+    """
+    return Gather(dim, sparse_grad)(input, index)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/gather_nd.py b/python/oneflow/nn/modules/gather_nd.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd8925753c00cced348c60e13ff5e874b3f1648a
--- /dev/null
+++ b/python/oneflow/nn/modules/gather_nd.py
@@ -0,0 +1,77 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.module import Module
+
+
+class Gather_nd(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.gather_nd_op = (
+            flow.builtin_op("gather_nd")
+            .Input("params")
+            .Input("indices")
+            .Output("out")
+            .Build()
+        )
+
+    def forward(self, input, index):
+        return self.gather_nd_op(input, index)[0]
+
+
+def gather_nd_op(input, index):
+    """This operator is a high-dimensional extension of `gather`, `index` is a K-dimensional
+    tensor, which is regarded as a index of input Tensor `input`.
+
+    Each element defines a slice of `input`:
+
+    .. math::
+
+        output[i_{0},i_{1},...,i_{K-2}] = input[index(i_{0},i_{1},...,i_{K-2})]
+
+
+    Args:
+        input: The input Tensor.
+        index: The slice indices.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.array([[1, 2,3], [4, 5,6],[7,8,9]]), dtype=flow.float)
+        >>> index_1 = flow.Tensor(np.array([[0], [2]]), dtype=flow.int)
+        >>> out_1 = flow.gather_nd(input,index_1)
+        >>> print(out_1.shape)
+        flow.Size([2, 3])
+        >>> out_1
+        tensor([[1., 2., 3.],
+                [7., 8., 9.]], dtype=oneflow.float32)
+        >>> index_2 = flow.Tensor(np.array([[0,2], [2,1]]), dtype=flow.int)
+        >>> out_2 = flow.gather_nd(input,index_2)
+        >>> out_2
+        tensor([3., 8.], dtype=oneflow.float32)
+
+    """
+    return Gather_nd()(input, index)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/greater.py b/python/oneflow/nn/modules/greater.py
new file mode 100644
index 0000000000000000000000000000000000000000..c52adade8e06fc37afa19a75592a47722c7f2b00
--- /dev/null
+++ b/python/oneflow/nn/modules/greater.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Greater(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.cast(y, flow.float32)
+        return flow.F.broadcast_greater(x, y)
+
+
+def greater_op(x, y):
+    """Returns the truth value of :math:`x > y` element-wise.
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+        y (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input1 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+
+        >>> out = flow.gt(input1, input2).shape
+        >>> out
+        flow.Size([2, 6, 5, 3])
+
+    """
+    return Greater()(x, y)
+
+
+@register_tensor_op("gt")
+def greater_op_tensor(x, y):
+    """
+
+    gt() -> Tensor
+
+    See :func:`oneflow.gt`
+
+    """
+    return Greater()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/greater_equal.py b/python/oneflow/nn/modules/greater_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..440a5fb68f49ed14374c9d9b75245cf9be41b9ca
--- /dev/null
+++ b/python/oneflow/nn/modules/greater_equal.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class GreaterEqual(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.cast(y, flow.float32)
+        return flow.F.broadcast_greater_equal(x, y)
+
+
+def greater_equal_op(x, y):
+    """Returns the truth value of :math:`x >= y` element-wise.
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+        y (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]).astype(np.float32), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([1, 1, 4]).astype(np.float32), dtype=flow.float32)
+
+        >>> out = flow.ge(input1, input2)
+        >>> out
+        tensor([1, 1, 0], dtype=oneflow.int8)
+
+    """
+    return GreaterEqual()(x, y)
+
+
+@register_tensor_op("ge")
+def greater_equal_op_tensor(x, y):
+    """
+
+    ge() -> Tensor
+
+    See :func:`oneflow.ge`
+
+    """
+    return GreaterEqual()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/in_top_k.py b/python/oneflow/nn/modules/in_top_k.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5ea4da058d255d3e25980e8b5d5bf136c0cc014
--- /dev/null
+++ b/python/oneflow/nn/modules/in_top_k.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class InTopk(Module):
+    def __init__(self, k) -> None:
+        super().__init__()
+        self._in_top_k = (
+            flow.builtin_op("in_top_k")
+            .Input("targets")
+            .Input("predictions")
+            .Output("out")
+            .Attr("k", k)
+            .Build()
+        )
+
+    def forward(self, targets, predictions):
+        assert (
+            targets.shape[0] == predictions.shape[0]
+        ), "The num of targets must equal the num of predictions"
+        assert len(targets.shape) == 1, "The dimension of targets must be 1"
+        assert len(predictions.shape) == 2, "The dimension of predictions must be 2"
+        return self._in_top_k(targets, predictions)
+
+
+def in_top_k_op(targets, predictions, k):
+    """Says whether the targets are in the top K predictions.
+
+    Args:
+        targets (Tensor): the target tensor of type int32 or int64.
+        predictions (Tensor): the predictions tensor of type float32 .
+        k (int): Number of top elements to look at for computing precision.
+
+    Returns:
+        oneflow.Tensor: A Tensor of type bool. Computed Precision at k as a bool Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> targets1 = flow.Tensor(np.array([3, 1]), dtype=flow.int32)
+        >>> predictions1 = flow.Tensor(np.array([[0.0, 1.0, 2.0, 3.0], [3.0, 2.0, 1.0, 0.0],]), dtype=flow.float32)
+        >>> out1 = flow.in_top_k(targets1, predictions1, k=1)
+        >>> out1
+        tensor([1, 0], dtype=oneflow.int8)
+        >>> out2 = flow.in_top_k(targets1, predictions1, k=2)
+        >>> out2
+        tensor([1, 1], dtype=oneflow.int8)
+        >>> targets2 = flow.Tensor(np.array([3, 1]), dtype=flow.int32, device=flow.device('cuda'))
+        >>> predictions2 = flow.Tensor(np.array([[0.0, 1.0, 2.0, 3.0], [3.0, 2.0, 1.0, 0.0],]), dtype=flow.float32, device=flow.device('cuda'))
+        >>> out3 = flow.in_top_k(targets2, predictions2, k=1)
+        >>> out3
+        tensor([1, 0], device='cuda:0', dtype=oneflow.int8)
+
+    """
+    return InTopk(k=k)(targets, predictions)[0]
+
+
+@register_tensor_op("in_top_k")
+def in_top_k_op_tensor(targets, predictions, k):
+    """
+
+    in_top_k() -> Tensor
+
+    See :func:`oneflow.in_top_k`
+
+    """
+    return InTopk(k=k)(targets, predictions)[0]
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/instancenorm.py b/python/oneflow/nn/modules/instancenorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3f38f268ea5d6bb04070cbb6a6e7f9c13b24057
--- /dev/null
+++ b/python/oneflow/nn/modules/instancenorm.py
@@ -0,0 +1,311 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.nn.modules.batchnorm import _NormBase
+
+
+class _InstanceNorm(_NormBase):
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-05,
+        momentum: float = 0.1,
+        affine: bool = False,
+        track_running_stats: bool = False,
+    ):
+        super().__init__(num_features, eps, momentum, affine, track_running_stats)
+
+    def _forward(self, x):
+        axis = 1
+        params_shape = [x.shape[axis]]
+        weight = self.weight
+        bias = self.bias
+        nd_params_shape = [1] * len(x.shape)
+        nd_params_shape[axis] = params_shape[0]
+        mean = x.mean(2, keepdim=True)
+        variance = x.var(2, keepdim=True)
+        normalized = (x - mean) / flow.sqrt(variance + self.eps)
+        if self.weight and params_shape[0] == self.weight.nelement():
+            weight = self.weight.reshape(shape=nd_params_shape)
+        if self.bias and params_shape[0] == self.bias.nelement():
+            bias = self.bias.reshape(shape=nd_params_shape)
+        if self.weight:
+            normalized = normalized * weight
+        if self.bias:
+            normalized = normalized + bias
+        return normalized
+
+    def forward(self, x):
+        self._check_input_dim(x)
+        reshape_to_1d = x.reshape([x.shape[0], x.shape[1], -1])
+        normalized_1d_out = self._forward(reshape_to_1d)
+        reshape_back_to_nd = normalized_1d_out.reshape(list(x.shape))
+        return reshape_back_to_nd
+
+
+class InstanceNorm1d(_InstanceNorm):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.InstanceNorm1d.html
+
+    Applies Instance Normalization over a 3D input (a mini-batch of 1D
+    inputs with optional additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size) if :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm1d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm1d` is applied
+        on each channel of channeled data like multidimensional time series, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm1d` usually don't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, L)`
+        - Output: :math:`(N, C, L)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> # Without Learnable Parameters
+        >>> m = flow.nn.InstanceNorm1d(100)
+        >>> # With Learnable Parameters
+        >>> m = flow.nn.InstanceNorm1d(100, affine=True)
+        >>> x = flow.Tensor(np.random.randn(20, 100, 40))
+        >>> output = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.dim() == 2:
+            raise ValueError(
+                "InstanceNorm1d returns 0-filled tensor to 2D tensor.This is because InstanceNorm1d reshapes inputs to(1, N * C, ...) from (N, C,...) and this makesvariances 0."
+            )
+        if input.dim() != 3:
+            raise ValueError("expected 3D input (got {}D input)".format(input.dim()))
+
+
+class InstanceNorm2d(_InstanceNorm):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.InstanceNorm2d.html
+
+    Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs
+    with additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size) if :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm2d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm2d` is applied
+        on each channel of channeled data like RGB images, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm2d` usually don't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, H, W)`
+        - Output: :math:`(N, C, H, W)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> # Without Learnable Parameters
+        >>> m = flow.nn.InstanceNorm2d(100)
+        >>> # With Learnable Parameters
+        >>> m = flow.nn.InstanceNorm2d(100, affine=True)
+        >>> x = flow.Tensor(np.random.randn(20, 100, 35, 45))
+        >>> output = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.dim() != 4:
+            raise ValueError("expected 4D input (got {}D input)".format(input.dim()))
+
+
+class InstanceNorm3d(_InstanceNorm):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.InstanceNorm3d.html
+
+    Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs
+    with additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\\gamma` and :math:`\\beta` are learnable parameter vectors
+    of size C (where C is the input size) if :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\\hat{x}_\\text{new} = (1 - \\text{momentum}) \\times \\hat{x} + \\text{momentum} \\times x_t`,
+        where :math:`\\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm3d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm3d` is applied
+        on each channel of channeled data like 3D models with RGB color, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm3d` usually don't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, D, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)`
+        - Output: :math:`(N, C, D, H, W)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> # Without Learnable Parameters
+        >>> m = flow.nn.InstanceNorm3d(100)
+        >>> # With Learnable Parameters
+        >>> m = flow.nn.InstanceNorm3d(100, affine=True)
+        >>> x = flow.Tensor(np.random.randn(20, 100, 35, 45, 10))
+        >>> output = m(x)
+
+    """
+
+    def _check_input_dim(self, input):
+        if input.dim() != 5:
+            raise ValueError("expected 5D input (got {}D input)".format(input.dim()))
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/interpolate.py b/python/oneflow/nn/modules/interpolate.py
new file mode 100644
index 0000000000000000000000000000000000000000..61b9a2c882cb03c4c0ec33796392eabde69d97fe
--- /dev/null
+++ b/python/oneflow/nn/modules/interpolate.py
@@ -0,0 +1,300 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+import warnings
+from typing import Optional, Tuple, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Interpolate(Module):
+    def __init__(
+        self,
+        size: Optional[Union[int, Tuple[int, ...]]] = None,
+        scale_factor: Optional[Union[float, Tuple[float, ...]]] = None,
+        mode: str = "nearest",
+        align_corners: Optional[bool] = None,
+        recompute_scale_factor: Optional[bool] = None,
+    ):
+        super().__init__()
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple((float(factor) for factor in scale_factor))
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        if mode in ("nearest", "area") and align_corners is not None:
+            raise ValueError(
+                "align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear"
+            )
+        self.mode = mode
+        self.recompute_scale_factor = recompute_scale_factor
+        if align_corners == None:
+            align_corners = False
+        self.align_corners = align_corners
+        self.height_scale = None
+        self.width_scale = None
+        if isinstance(self.scale_factor, float):
+            self.height_scale = self.scale_factor
+            self.width_scale = self.scale_factor
+        elif isinstance(self.scale_factor, tuple):
+            self.height_scale = self.scale_factor[0]
+            self.width_scale = self.scale_factor[1]
+        else:
+            pass
+        if self.mode not in (
+            "nearest",
+            "bilinear",
+            "linear",
+            "area",
+            "bicubic",
+            "trilinear",
+        ):
+            raise ValueError(
+                'interpolation must be "nearest" or "bilinear" or "linear" or "area" or "bicubic" or "trilinear".'
+            )
+        if self.mode == "nearest" and self.align_corners:
+            raise ValueError('interpolation "nearest" does not support align_corners.')
+
+    def forward(self, x):
+        dim = len(x.shape) - 2
+        if self.size is not None and self.scale_factor is not None:
+            raise ValueError("only one of size or scale_factor should be defined")
+        elif self.size is not None:
+            assert self.scale_factor is None
+            scale_factors = []
+            if isinstance(self.size, (list, tuple)):
+                if len(self.size) != dim:
+                    raise ValueError(
+                        "size shape must match input shape. Input is {}D, size is {}".format(
+                            dim, len(self.size)
+                        )
+                    )
+                output_size = self.size
+            else:
+                output_size = [self.size for _ in range(dim)]
+            for i in range(dim):
+                scale_factors.append(output_size[i] / x.shape[i + 2])
+        elif self.scale_factor is not None:
+            assert self.size is None
+            output_size = None
+            if isinstance(self.scale_factor, (list, tuple)):
+                if len(self.scale_factor) != dim:
+                    raise ValueError(
+                        "scale_factor shape must match input shape. Input is {}D, scale_factor is {}".format(
+                            dim, len(self.scale_factor)
+                        )
+                    )
+                scale_factors = self.scale_factor
+            else:
+                scale_factors = [self.scale_factor for _ in range(dim)]
+        else:
+            raise ValueError("either size or scale_factor should be defined")
+        if self.recompute_scale_factor is None:
+            if scale_factors is not None:
+                for scale in scale_factors:
+                    if math.floor(scale) != scale:
+                        warnings.warn(
+                            "The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details. "
+                        )
+                    break
+        elif self.recompute_scale_factor and self.size is not None:
+            raise ValueError(
+                "recompute_scale_factor is not meaningful with an explicit size."
+            )
+        if self.mode == "area" and output_size is None:
+            self.recompute_scale_factor = True
+        if self.recompute_scale_factor is True:
+            assert scale_factors is not None
+            output_size = [
+                int(math.floor(float(x.size(i + 2)) * scale_factors[i]))
+                for i in range(dim)
+            ]
+            scale_factors = []
+            for i in range(dim):
+                scale_factors.append(output_size[i] / x.shape[2 + i])
+        if len(x.shape) == 3 and self.mode == "nearest":
+            return flow.F.upsample_nearest_1d(
+                x, scale_factor=scale_factors[0], data_format="channels_first"
+            )
+        if len(x.shape) == 4 and self.mode == "nearest":
+            return flow.F.upsample_nearest_2d(
+                x,
+                height_scale=scale_factors[0],
+                width_scale=scale_factors[1],
+                data_format="channels_first",
+            )
+        if len(x.shape) == 5 and self.mode == "nearest":
+            return flow.F.upsample_nearest_3d(
+                x,
+                depth_scale=scale_factors[0],
+                height_scale=scale_factors[1],
+                width_scale=scale_factors[2],
+                data_format="channels_first",
+            )
+        if len(x.shape) == 3 and self.mode == "area":
+            assert output_size is not None
+            return flow.F.adaptive_avg_pool1d(x, output_size)
+        if len(x.shape) == 4 and self.mode == "area":
+            assert output_size is not None
+            return flow.F.adaptive_avg_pool2d(x, output_size)
+        if len(x.shape) == 5 and self.mode == "area":
+            assert output_size is not None
+            return flow.F.adaptive_avg_pool3d(x, output_size)
+        if len(x.shape) == 3 and self.mode == "linear":
+            assert self.align_corners is not None
+            return flow.F.upsample_linear_1d(
+                x,
+                scale_factor=scale_factors[0],
+                align_corners=self.align_corners,
+                data_format="channels_first",
+            )
+        if len(x.shape) == 4 and self.mode == "bilinear":
+            assert self.align_corners is not None
+            return flow.F.upsample_bilinear_2d(
+                x,
+                height_scale=scale_factors[0],
+                width_scale=scale_factors[1],
+                align_corners=self.align_corners,
+                data_format="channels_first",
+            )
+        if len(x.shape) == 4 and self.mode == "bicubic":
+            assert self.align_corners is not None
+            return flow.F.upsample_bicubic_2d(
+                x,
+                height_scale=scale_factors[0],
+                width_scale=scale_factors[1],
+                align_corners=self.align_corners,
+                data_format="channels_first",
+            )
+        if len(x.shape) == 5 and self.mode == "trilinear":
+            assert self.align_corners is not None
+            return flow.F.upsample_trilinear_3d(
+                x,
+                depth_scale=scale_factors[0],
+                height_scale=scale_factors[1],
+                width_scale=scale_factors[2],
+                align_corners=self.align_corners,
+                data_format="channels_first",
+            )
+
+
+def interpolate(
+    input,
+    size=None,
+    scale_factor=None,
+    mode="nearest",
+    align_corners=None,
+    recompute_scale_factor=None,
+):
+    """The interface is consistent with PyTorch.    
+    
+    The documentation is referenced from: https://pytorch.org/docs/1.9.0/_modules/torch/nn/functional.html#interpolate
+    
+
+    Down/up samples the input to either the given :attr:`size` or the given
+    :attr:`scale_factor`
+
+    The algorithm used for interpolation is determined by :attr:`mode`.
+
+    Currently temporal, spatial and volumetric sampling are supported, i.e.
+    expected inputs are 3-D, 4-D or 5-D in shape.
+
+    The input dimensions are interpreted in the form:
+    `mini-batch x channels x [optional depth] x [optional height] x width`.
+
+    The modes available for resizing are: `nearest`, `linear` (3D-only),
+    `bilinear`, `bicubic` (4D-only), `trilinear` (5D-only), `area`
+
+    Args:
+        input (Tensor): the input tensor
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
+            output spatial size.
+        scale_factor (float or Tuple[float]): multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str): algorithm used for upsampling:
+            ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
+            ``'trilinear'`` | ``'area'``. Default: ``'nearest'``
+        align_corners (bool, optional): Geometrically, we consider the pixels of the
+            input and output as squares rather than points.
+            If set to ``True``, the input and output tensors are aligned by the
+            center points of their corner pixels, preserving the values at the corner pixels.
+            If set to ``False``, the input and output tensors are aligned by the corner
+            points of their corner pixels, and the interpolation uses edge value padding
+            for out-of-boundary values, making this operation *independent* of input size
+            when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode`
+            is ``'linear'``, ``'bilinear'``, ``'bicubic'`` or ``'trilinear'``.
+            Default: ``False``
+        recompute_scale_factor (bool, optional): recompute the scale_factor for use in the
+            interpolation calculation.  When `scale_factor` is passed as a parameter, it is used
+            to compute the `output_size`.  If `recompute_scale_factor` is ``False`` or not specified,
+            the passed-in `scale_factor` will be used in the interpolation computation.
+            Otherwise, a new `scale_factor` will be computed based on the output and input sizes for
+            use in the interpolation computation (i.e. the computation will be identical to if the computed
+            `output_size` were passed-in explicitly).  Note that when `scale_factor` is floating-point,
+            the recomputed scale_factor may differ from the one passed in due to rounding and precision
+            issues.
+
+    .. note::
+        With ``mode='bicubic'``, it's possible to cause overshoot, in other words it can produce
+        negative values or values greater than 255 for images.
+        Explicitly call ``result.clamp(min=0, max=255)`` if you want to reduce the overshoot
+        when displaying the image.
+
+    .. warning::
+        With ``align_corners = True``, the linearly interpolating modes
+        (`linear`, `bilinear`, and `trilinear`) don't proportionally align the
+        output and input pixels, and thus the output values can depend on the
+        input size. This was the default behavior for these modes up to version
+        0.3.1. Since then, the default behavior is ``align_corners = False``.
+        See :class:`~torch.nn.Upsample` for concrete examples on how this
+        affects the outputs.
+
+    .. warning::
+        When scale_factor is specified, if recompute_scale_factor=True,
+        scale_factor is used to compute the output_size which will then
+        be used to infer new scales for the interpolation.
+        The default behavior for recompute_scale_factor changed to False
+        in 1.6.0, and scale_factor is used in the interpolation
+        calculation.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 4)), dtype=flow.float32)
+        >>> output = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="linear")
+        >>> output
+        tensor([[[1.  , 1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.  ]]],
+               dtype=oneflow.float32)
+
+    """
+    return Interpolate(
+        size=size,
+        scale_factor=scale_factor,
+        mode=mode,
+        align_corners=align_corners,
+        recompute_scale_factor=recompute_scale_factor,
+    )(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/less.py b/python/oneflow/nn/modules/less.py
new file mode 100644
index 0000000000000000000000000000000000000000..146038409460f3334000be3b48871ee432fd2f55
--- /dev/null
+++ b/python/oneflow/nn/modules/less.py
@@ -0,0 +1,69 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Less(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.cast(y, flow.float32)
+        return flow.F.broadcast_less(x, y)
+
+
+@register_tensor_op("lt")
+def less_op(x, y):
+    """Returns the truth value of :math:`x < y` element-wise.
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+        y (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]).astype(np.float32), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([1, 2, 4]).astype(np.float32), dtype=flow.float32)
+
+        >>> out = flow.lt(input1, input2)
+        >>> out
+        tensor([0, 0, 1], dtype=oneflow.int8)
+
+    """
+    return Less()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/less_equal.py b/python/oneflow/nn/modules/less_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..39f00f8a71140733370674aa09fe151fe1897040
--- /dev/null
+++ b/python/oneflow/nn/modules/less_equal.py
@@ -0,0 +1,69 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class LessEqual(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if x.dtype != flow.float32:
+            x = flow.cast(x, flow.float32)
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)], dtype=flow.float32, device=flow.device(x.device.type)
+            )
+        if y.dtype != flow.float32:
+            y = flow.cast(y, flow.float32)
+        return flow.F.broadcast_less_equal(x, y)
+
+
+@register_tensor_op("le")
+def less_equal_op(x, y):
+    """Returns the truth value of :math:`x <= y` element-wise.
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+        y (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: A Tensor with int8 type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]).astype(np.float32), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([1, 1, 4]).astype(np.float32), dtype=flow.float32)
+
+        >>> out = flow.le(input1, input2)
+        >>> out
+        tensor([1, 0, 1], dtype=oneflow.int8)
+
+    """
+    return LessEqual()(x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/linear.py b/python/oneflow/nn/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..6affbc2167fe561ba82c22ec121800c0cf18c7fb
--- /dev/null
+++ b/python/oneflow/nn/modules/linear.py
@@ -0,0 +1,133 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.init import _calculate_fan_in_and_fan_out
+from oneflow.nn.module import Module
+
+
+class Identity(Module):
+    """A placeholder identity operator that is argument-insensitive.
+
+    Args:
+        args: any argument (unused)
+        kwargs: any keyword argument (unused)
+
+    For example:
+
+    .. code-block:: python
+
+        import numpy as np
+        import oneflow as flow
+
+        m = flow.nn.Identity()
+        input = flow.Tensor(np.random.rand(2, 3, 4, 5))
+
+        output = m(input)
+
+        # output = input
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(self, input: Tensor) -> Tensor:
+        return input
+
+
+class Linear(Module):
+    """Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
+
+    Args:
+
+        - in_features: size of each input sample
+
+        - out_features: size of each output sample
+
+        - bias: If set to ``False``, the layer will not learn an additive bias. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
+          additional dimensions and :math:`H_{in} = {in\\_features}`
+
+        - Output: :math:`(N, *, H_{out})` where all but the last dimension
+          are the same shape as the input and :math:`H_{out} = {out\\_features}`.
+
+    Attr:
+        - :attr:`weight`: the learnable weights of the module of shape :math:`({out\\_features}, {in\\_features})`. The values are initialized from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})`, where :math:`(k = 1 / {in\\_features})`
+
+        - :attr:`bias`: the learnable bias of the module of shape :math:`({out\\_features})`. If :attr:`bias` is ``True``, the values are initialized from :math:`\\mathcal{U}(-\\sqrt{k}, \\sqrt{k})` where :math:`(k = 1 / {in\\_features})`
+
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+
+        >>> m = flow.nn.Linear(20, 30, False)
+        >>> input = flow.Tensor(np.random.randn(128, 20))
+        >>> output = m(input)
+        >>> output.size()
+        flow.Size([128, 30])
+
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = bias
+        self.weight = flow.nn.Parameter(flow.Tensor(out_features, in_features))
+        self.bias = None
+        if bias:
+            self.bias = flow.nn.Parameter(flow.Tensor(out_features))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        flow.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            (fan_in, _) = _calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            flow.nn.init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, x):
+        assert len(x.shape) >= 2, "Tensor x's dim should >=2"
+        if len(x.shape) == 2:
+            res = flow.F.matmul(x, self.weight, transpose_a=False, transpose_b=True)
+        else:
+            res = flow.F.broadcast_matmul(
+                x, self.weight, transpose_a=False, transpose_b=True
+            )
+        if self.use_bias:
+            res += self.bias
+        return res
+
+    def extra_repr(self) -> str:
+        return "in_features={}, out_features={}, bias={}".format(
+            self.in_features, self.out_features, self.bias is not None
+        )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/log1p.py b/python/oneflow/nn/modules/log1p.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a1f434b87ffd2e1492f032cccae53065a7ebcda
--- /dev/null
+++ b/python/oneflow/nn/modules/log1p.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Log1p(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op = flow.builtin_op("log1p").Input("x").Output("y").Build()
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+@register_tensor_op("log1p")
+def log1p_op(input):
+    """Returns a new tensor with the natural logarithm of (1 + input).
+
+    .. math::
+        \\text{out}_{i}=\\log_e(1+\\text{input}_{i})
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = flow.Tensor(np.array([1.3, 1.5, 2.7]))
+        >>> out = flow.log1p(x).numpy()
+        >>> out
+        array([0.8329091 , 0.91629076, 1.3083328 ], dtype=float32)
+
+    """
+    return Log1p()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/loss.py b/python/oneflow/nn/modules/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..39fef93b0b86ae0b43c887e81b36b3e92acc2f62
--- /dev/null
+++ b/python/oneflow/nn/modules/loss.py
@@ -0,0 +1,1168 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.module import Module
+from oneflow.nn.modules.constant import _ConstantBase
+
+
+class L1Loss(Module):
+    """This operator computes the L1 Loss between each element in `input` and `target`.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        output = |Target - Input|
+
+    if reduction = "mean":
+
+    .. math::
+
+        output = \\frac{1}{n}\\sum_{i=1}^n|Target_i - Input_i|
+
+    if reduction = "sum":
+
+    .. math::
+
+        output = \\sum_{i=1}^n|Target_i - Input_i|
+
+    Args:
+        input (oneflow.Tensor): The input Tensor.
+        target (oneflow.Tensor): The target Tensor.
+        reduction (str): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+
+    Returns:
+        oneflow.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor([[1, 1, 1], [2, 2, 2], [7, 7, 7]], dtype = flow.float32)
+        >>> target = flow.Tensor([[4, 4, 4], [4, 4, 4], [4, 4, 4]], dtype = flow.float32)
+        >>> m = flow.nn.L1Loss(reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([[3., 3., 3.],
+                [2., 2., 2.],
+                [3., 3., 3.]], dtype=oneflow.float32)
+        >>> m_mean = flow.nn.L1Loss(reduction="mean")
+        >>> out = m_mean(input, target)
+        >>> out
+        tensor([2.6667], dtype=oneflow.float32)
+        >>> m_mean = flow.nn.L1Loss(reduction="sum")
+        >>> out = m_mean(input, target)
+        >>> out
+        tensor([24.], dtype=oneflow.float32)
+    """
+
+    def __init__(self, reduction: str = "mean", reduce=True) -> None:
+        super().__init__()
+        if reduce is not None and (not reduce):
+            raise ValueError("Argument reduce is not supported yet")
+        assert reduction in [
+            "none",
+            "mean",
+            "sum",
+            None,
+        ], "only 'sum', 'mean' and 'none' supported by now"
+        self.reduction = reduction
+
+    def forward(self, input, target):
+        assert (
+            input.shape == target.shape
+        ), "The Input shape must be the same as Target shape"
+        l1_value = flow.abs(flow.sub(input, target))
+        if self.reduction == "mean":
+            return flow.mean(l1_value)
+        elif self.reduction == "sum":
+            return flow.sum(l1_value)
+        else:
+            return l1_value
+
+
+class CrossEntropyLoss(Module):
+    """This criterion combines :class:`~flow.nn.LogSoftmax` and :class:`~flow.nn.NLLLoss` in one single class.
+
+    It is useful when training a classification problem with `C` classes.
+
+    The `input` is expected to contain raw, unnormalized scores for each class.
+
+    `input` has to be a Tensor of size either :math:`(minibatch, C)` or
+    :math:`(minibatch, C, d_1, d_2, ..., d_K)`
+    with :math:`K \\geq 1` for the `K`-dimensional case (described later).
+
+    This criterion expects a class index in the range :math:`[0, C-1]` as the
+    `target` for each value of a 1D tensor of size `minibatch`;
+
+    The loss can be described as:
+
+    .. math::
+        \\text{loss}(x, class) = -\\log\\left(\\frac{\\exp(x[class])}{\\sum_j \\exp(x[j])}\\right)
+                       = -x[class] + \\log\\left(\\sum_j \\exp(x[j])\\right)
+
+    Can also be used for higher dimension inputs, such as 2D images, by providing
+    an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \\geq 1`,
+    where :math:`K` is the number of dimensions, and a target of appropriate shape
+    (see below).
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
+            be applied, ``'mean'``: the weighted mean of the output is taken,
+            ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> input = flow.Tensor(
+        ...    [[-0.1664078, -1.7256707, -0.14690138],
+        ...        [-0.21474946, 0.53737473, 0.99684894],
+        ...        [-1.135804, -0.50371903, 0.7645404]], dtype=flow.float32)
+        >>> target = flow.Tensor(np.array([0, 1, 2]), dtype=flow.int32)
+        >>> out = flow.nn.CrossEntropyLoss(reduction="none")(input, target)
+        >>> out
+        tensor([0.802 , 1.1167, 0.3583], dtype=oneflow.float32)
+        >>> out_sum = flow.nn.CrossEntropyLoss(reduction="sum")(input, target)
+        >>> out_sum
+        tensor([2.2769], dtype=oneflow.float32)
+        >>> out_mean = flow.nn.CrossEntropyLoss(reduction="mean")(input, target)
+        >>> out_mean
+        tensor([0.759], dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        weight=None,
+        ignore_index: Optional[int] = None,
+        reduction: Optional[str] = "mean",
+    ) -> None:
+        super().__init__()
+        if weight is not None:
+            raise ValueError("Argument weight is not supported yet")
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.ignore_index = ignore_index
+        self.reduction = reduction
+
+    def forward(self, input, target):
+        assert len(input.shape) <= 4
+        assert len(target.shape) == len(input.shape) - 1
+        input_shape_len = len(input.shape)
+        if input_shape_len == 3:
+            (b, c, h) = (input.shape[0], input.shape[1], input.shape[2])
+            input = flow.F.transpose(input, perm=(0, 2, 1))
+            input = input.reshape(shape=[-1, input.shape[2]])
+            target = target.flatten()
+        elif input_shape_len == 4:
+            (b, c, h, w) = (
+                input.shape[0],
+                input.shape[1],
+                input.shape[2],
+                input.shape[3],
+            )
+            input = flow.F.transpose(input, perm=(0, 2, 3, 1))
+            input = input.reshape(shape=[-1, input.shape[3]])
+            target = target.flatten()
+        elif input_shape_len >= 5:
+            raise NotImplemented
+        out = flow.F.sparse_softmax_cross_entropy(
+            input, target, depth=input.shape[len(input.shape) - 1]
+        )
+        if self.ignore_index is not None:
+            zeros = flow.zeros(size=out.shape, dtype=out.dtype, device=out.device)
+            condition = flow.eq(target, self.ignore_index)
+            ones = flow.ones(
+                size=condition.shape, dtype=condition.dtype, device=condition.device
+            )
+            condition = ones.sub(condition).reshape(tuple(out.shape))
+            out = flow.where(condition, out, zeros)
+            if self.reduction == "mean":
+                reduce_sum = out.sum()
+                reduce_count = condition.argwhere().shape[0]
+                out = flow.mul(reduce_sum, 1.0 / reduce_count)
+        if self.reduction == "mean":
+            return out.mean()
+        elif self.reduction == "sum":
+            return out.sum()
+        else:
+            if input_shape_len == 4:
+                out = out.reshape((b, h, w))
+            return out
+
+
+class BCELoss(Module):
+    """This operator computes the binary cross entropy loss.
+
+    The equation is:
+
+    if reduction = "none":
+
+    .. math::
+
+        out = -(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    if reduction = "mean":
+
+    .. math::
+
+        out = -\\frac{1}{n}\\sum_{i=1}^n(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    if reduction = "sum":
+
+    .. math::
+
+        out = -\\sum_{i=1}^n(Target_i*log(Input_i) + (1-Target_i)*log(1-Input_i))
+
+    Args:
+        weight (oneflow.Tensor, optional): The manual rescaling weight to the loss. Default to None, whose corresponding weight value is 1.
+        reduction (str, optional): The reduce type, it can be one of "none", "mean", "sum". Defaults to "mean".
+
+    Attention:
+        The input value must be in the range of (0, 1). Or the loss function may return `nan` value.
+
+    Returns:
+        oneflow.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.array([[1.2, 0.2, -0.3], [0.7, 0.6, -2]]).astype(np.float32))
+        >>> target = flow.Tensor(np.array([[0, 1, 0], [1, 0, 1]]).astype(np.float32))
+        >>> weight = flow.Tensor(np.array([[2, 2, 2], [2, 2, 2]]).astype(np.float32))
+        >>> activation = flow.nn.Sigmoid()
+        >>> sigmoid_input = activation(input)
+        >>> m = flow.nn.BCELoss(weight, reduction="none")
+        >>> out = m(sigmoid_input, target)
+        >>> out
+        tensor([[2.9266, 1.1963, 1.1087],
+                [0.8064, 2.075 , 4.2539]], dtype=oneflow.float32)
+        >>> m_sum = flow.nn.BCELoss(weight, reduction="sum")
+        >>> out = m_sum(sigmoid_input, target)
+        >>> out
+        tensor([12.3668], dtype=oneflow.float32)
+        >>> m_mean = flow.nn.BCELoss(weight, reduction="mean")
+        >>> out = m_mean(sigmoid_input, target)
+        >>> out
+        tensor([2.0611], dtype=oneflow.float32)
+        >>> m_none = flow.nn.BCELoss()
+        >>> out = m_none(sigmoid_input, target)
+        >>> out
+        tensor([1.0306], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, weight: Tensor = None, reduction: str = "mean") -> None:
+        super().__init__()
+        assert reduction in [
+            "none",
+            "sum",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and 'none' supported by now"
+        self.weight = weight
+        self.reduction = reduction
+
+    def forward(self, input, target):
+        assert (
+            input.shape == target.shape
+        ), "The Input shape must be the same as Target shape"
+        _cross_entropy_loss = flow.negative(
+            target * flow.log(input) + (1 - target) * flow.log(1 - input)
+        )
+        if self.weight is not None:
+            assert (
+                self.weight.shape == input.shape
+            ), "The weight shape must be the same as Input shape"
+            _weighted_loss = self.weight * _cross_entropy_loss
+        else:
+            _weighted_loss = _cross_entropy_loss
+        if self.reduction == "mean":
+            return flow.mean(_weighted_loss)
+        elif self.reduction == "sum":
+            return flow.sum(_weighted_loss)
+        else:
+            return _weighted_loss
+
+
+class NLLLoss(Module):
+    """ The negative log likelihood loss. It is useful to train a classification
+    problem with `C` classes.
+
+    The `input` given through a forward call is expected to contain
+    log-probabilities of each class. `input` has to be a Tensor of size either
+    :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
+    with :math:`K \\geq 1` for the `K`-dimensional case (described later).
+
+    Obtaining log-probabilities in a neural network is easily achieved by
+    adding a  `LogSoftmax`  layer in the last layer of your network.
+    You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
+    layer.
+
+    The `target` that this loss expects should be a class index in the range :math:`[0, C-1]`
+    where `C = number of classes`;
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad
+        l_n = - w_{y_n} x_{n,y_n}, \\quad
+        w_{c} = \\mathbb{1},
+
+    where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and
+    :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then
+
+    .. math::
+        \\ell(x, y) = \\begin{cases}
+            \\sum_{n=1}^N \\frac{1}{N} l_n, &
+            \\text{if reduction} = \\text{`mean';}\\\\
+            \\sum_{n=1}^N l_n,  &
+            \\text{if reduction} = \\text{`sum'.}
+        \\end{cases}
+
+    Can also be used for higher dimension inputs, such as 2D images, by providing
+    an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \\geq 1`,
+    where :math:`K` is the number of dimensions, and a target of appropriate shape
+    (see below). In the case of images, it computes NLL loss per-pixel.
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
+            be applied, ``'mean'``: the weighted mean of the output is taken,
+            ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> input = flow.Tensor(
+        ... [[-0.1664078, -1.7256707, -0.14690138],
+        ... [-0.21474946, 0.53737473, 0.99684894],
+        ... [-1.135804, -0.50371903, 0.7645404]], dtype=flow.float32)
+        >>> target = flow.Tensor(np.array([0, 1, 2]), dtype=flow.int32)
+        >>> m = flow.nn.NLLLoss(reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([ 0.1664, -0.5374, -0.7645], dtype=oneflow.float32)
+
+        >>> m = flow.nn.NLLLoss(reduction="sum")
+        >>> out = m(input, target)
+        >>> out
+        tensor([-1.1355], dtype=oneflow.float32)
+
+        >>> m = flow.nn.NLLLoss(reduction="mean")
+        >>> out = m(input, target)
+        >>> out
+        tensor([-0.3785], dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self, weight=None, ignore_index: int = None, reduction: str = "mean"
+    ) -> None:
+        super().__init__()
+        if weight != None:
+            raise ValueError("Argument weight is not supported yet")
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.ignore_index = ignore_index
+        self.reduction = reduction
+
+    def nllloss_1d(self, input, target):
+        target = flow.F.reshape(target, shape=(target.shape[0], 1))
+        res = flow.F.dim_gather(input, target, dim=1)
+        res = flow.F.squeeze(res, dim=[1])
+        return res
+
+    def forward(self, input, target):
+        assert len(input.shape) <= 4
+        assert len(target.shape) == len(input.shape) - 1
+        input = input.negative()
+        if len(input.shape) == 2:
+            res = self.nllloss_1d(input, target)
+        elif len(input.shape) == 3:
+            (b, c, h) = (input.shape[0], input.shape[1], input.shape[2])
+            input = flow.F.transpose(input, perm=(0, 2, 1))
+            input = input.reshape(shape=[-1, input.shape[2]])
+            target = target.flatten()
+            res = self.nllloss_1d(input, target)
+            res = res.reshape((b, h))
+        elif len(input.shape) == 4:
+            (b, c, h, w) = (
+                input.shape[0],
+                input.shape[1],
+                input.shape[2],
+                input.shape[3],
+            )
+            input = flow.F.transpose(input, perm=(0, 2, 3, 1))
+            input = input.reshape(shape=[-1, input.shape[3]])
+            target = target.flatten()
+            res = self.nllloss_1d(input, target)
+            res = res.reshape((b, h, w))
+        else:
+            raise NotImplemented
+        if self.ignore_index is not None:
+            zeros = flow.zeros(size=res.shape, dtype=res.dtype, device=res.device)
+            condition = flow.eq(target, self.ignore_index)
+            ones = flow.ones(
+                size=condition.shape, dtype=condition.dtype, device=condition.device
+            )
+            condition = ones.sub(condition).reshape(tuple(res.shape))
+            res = flow.where(condition, res, zeros)
+            if self.reduction == "mean":
+                res = res.sum()
+                reduce_count = condition.argwhere().shape[0]
+                res = flow.mul(res, 1.0 / reduce_count)
+        if self.reduction == "none":
+            return res
+        elif self.reduction == "sum":
+            return res.sum()
+        else:
+            return res.mean()
+
+
+class KLDivLoss(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html?highlight=kldivloss#torch.nn.KLDivLoss
+
+    The Kullback-Leibler divergence loss measure
+
+    `Kullback-Leibler divergence`_ is a useful distance measure for continuous
+    distributions and is often useful when performing direct regression over
+    the space of (discretely sampled) continuous output distributions.
+
+    As with :class:`~torch.nn.NLLLoss`, the `input` given is expected to contain
+    *log-probabilities* and is not restricted to a 2D Tensor.
+    The targets are interpreted as *probabilities* by default, but could be considered
+    as *log-probabilities* with :attr:`log_target` set to ``True``.
+
+    This criterion expects a `target` `Tensor` of the same size as the
+    `input` `Tensor`.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        l(x,y) = L = \\{ l_1,\\dots,l_N \\}, \\quad
+        l_n = y_n \\cdot \\left( \\log y_n - x_n \\right)
+
+    where the index :math:`N` spans all dimensions of ``input`` and :math:`L` has the same
+    shape as ``input``. If :attr:`reduction` is not ``'none'`` (default ``'mean'``), then:
+
+    .. math::
+        \\ell(x, y) = \\begin{cases}
+            \\operatorname{mean}(L), & \\text{if reduction} = \\text{`mean';} \\\\
+            \\operatorname{sum}(L),  & \\text{if reduction} = \\text{`sum'.}
+        \\end{cases}
+
+    In default :attr:`reduction` mode ``'mean'``, the losses are averaged for each minibatch over observations
+    **as well as** over dimensions. ``'batchmean'`` mode gives the correct KL divergence where losses
+    are averaged over batch dimension only. ``'mean'`` mode's behavior will be changed to the same as
+    ``'batchmean'`` in the next major release.
+
+    .. _`kullback-leibler divergence`: https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'batchmean'`` | ``'sum'`` | ``'mean'``.
+            ``'none'``: no reduction will be applied.
+            ``'batchmean'``: the sum of the output will be divided by batchsize.
+            ``'sum'``: the output will be summed.
+            ``'mean'``: the output will be divided by the number of elements in the output.
+            Default: ``'mean'``
+        log_target (bool, optional): Specifies whether `target` is passed in the log space.
+            Default: ``False``
+
+    .. note::
+        :attr:`reduction` = ``'mean'`` doesn't return the true kl divergence value, please use
+        :attr:`reduction` = ``'batchmean'`` which aligns with KL math definition.
+        In the next major release, ``'mean'`` will be changed to be the same as ``'batchmean'``.
+
+    Shape:
+        - Input: :math:`(N, *)` where :math:`*` means, any number of additional
+          dimensions
+        - Target: :math:`(N, *)`, same shape as the input
+        - Output: scalar by default. If :attr:``reduction`` is ``'none'``, then :math:`(N, *)`,
+          the same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor([-0.9021705, 0.08798598, 1.04686249], dtype=flow.float32)
+        >>> target = flow.Tensor([1.22386942, -0.89729659, 0.01615712], dtype=flow.float32)
+        >>> m = flow.nn.KLDivLoss(reduction="none", log_target=False)
+        >>> out = m(input, target)
+        >>> out
+        tensor([ 1.3514,  0.    , -0.0836], dtype=oneflow.float32)
+        >>> m = flow.nn.KLDivLoss(reduction="mean", log_target=False)
+        >>> out = m(input, target)
+        >>> out
+        tensor([0.4226], dtype=oneflow.float32)
+        >>> m = flow.nn.KLDivLoss(reduction="sum", log_target=True)
+        >>> out = m(input, target)
+        >>> out
+        tensor([5.7801], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, reduction: str = "mean", log_target: bool = False) -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "Argument reduction only support 'sum'/'mean'/'none'/None for now!"
+        self.reduction = reduction
+        self.log_target = log_target
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        if self.log_target:
+            _kl_div_loss = flow.exp(target) * (target - input)
+        else:
+            _kl_div_out_loss = target * (flow.log(target) - input)
+            _zeros = flow.zeros(
+                size=_kl_div_out_loss.shape,
+                dtype=_kl_div_out_loss.dtype,
+                device=_kl_div_out_loss.device,
+            )
+            _condition = flow.gt(target, 0)
+            _kl_div_loss = flow.where(_condition, _kl_div_out_loss, _zeros)
+        if self.reduction == "mean":
+            return flow.mean(_kl_div_loss)
+        elif self.reduction == "sum":
+            return flow.sum(_kl_div_loss)
+        else:
+            return _kl_div_loss
+
+
+class MSELoss(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html?highlight=mseloss#torch.nn.MSELoss
+
+    Creates a criterion that measures the mean squared error (squared L2 norm) between
+    each element in the input :math:`x` and target :math:`y`.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad
+        l_n = \\left( x_n - y_n \\right)^2,
+
+    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then:
+
+    .. math::
+        \\ell(x, y) =
+        \\begin{cases}
+            \\operatorname{mean}(L), &  \\text{if reduction} = \\text{`mean';}\\\\
+            \\operatorname{sum}(L),  &  \\text{if reduction} = \\text{`sum'.}
+        \\end{cases}
+
+    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
+    of :math:`n` elements each.
+
+    The mean operation still operates over all the elements, and divides by :math:`n`.
+
+    The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
+
+    Args:
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, *)` where :math:`*` means, any number of additional
+          dimensions
+        - Target: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(
+        ... [[-0.02557137, 0.03101675, 1.37493674],
+        ... [0.25599439, -1.08372561, -0.21006816]], dtype=flow.float32)
+        >>> target = flow.Tensor(
+        ... [[-1.53105064, -0.68137555, 0.5931354],
+        ... [-0.49158347, 0.93673637, 0.1324141]], dtype=flow.float32)
+        >>> m = flow.nn.MSELoss(reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([[2.2665, 0.5075, 0.6112],
+                [0.5589, 4.0823, 0.1173]], dtype=oneflow.float32)
+        >>> m = flow.nn.MSELoss(reduction="mean")
+        >>> out = m(input, target)
+        >>> out
+        tensor([1.3573], dtype=oneflow.float32)
+        >>> m = flow.nn.MSELoss(reduction="sum")
+        >>> out = m(input, target)
+        >>> out
+        tensor([8.1436], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, reduction: str = "mean") -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "Argument reduction only support 'sum'/'mean'/'none'/None for now!"
+        self.reduction = reduction
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        mean_squared_difference = flow.square(flow.sub(input, target))
+        if self.reduction == "mean":
+            return flow.mean(mean_squared_difference)
+        elif self.reduction == "sum":
+            return flow.sum(mean_squared_difference)
+        else:
+            return mean_squared_difference
+
+
+class MarginRankingLoss(Module):
+    """Creates a criterion that measures the loss given
+    inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensors`,
+    and a label 1D mini-batch tensor :math:`y` (containing 1 or -1).
+
+    If :math:`y = 1` then it assumed the first input should be ranked higher
+    (have a larger value) than the second input, and vice-versa for :math:`y = -1`.
+
+    The loss function for each sample in the mini-batch is:
+
+    .. math::
+        \\text{loss}(x1, x2, y) = \\max(0, -y * (x1 - x2) + \\text{margin})
+
+    Args:
+        margin (float, optional): Has a default value of :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+
+    Shape:
+        - `x1` : :math:`(N, D)` where `N` is the batch size and `D` is the size of a sample.
+        - `x2` : :math:`(N, D)` where `N` is the batch size and `D` is the size of a sample.
+        - Target: :math:`(N)`
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x1 = flow.Tensor(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), dtype=flow.float32)
+        >>> x2 = flow.Tensor(np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), dtype=flow.float32)
+        >>> target = flow.Tensor(np.array([[1, -1, 1],[-1, 1, -1], [1, 1, 1]]), dtype=flow.float32)
+        >>> m = flow.nn.MarginRankingLoss(margin =1.0, reduction="none")
+        >>> out = m(x1, x2, target)
+        >>> out
+        tensor([[2., 1., 0.],
+                [3., 0., 5.],
+                [0., 0., 0.]], dtype=oneflow.float32)
+
+        >>> m = flow.nn.MarginRankingLoss(margin = 0.3, reduction="sum")
+        >>> out = m(x1, x2, target)
+        >>> out
+        tensor([8.2], dtype=oneflow.float32)
+
+        >>> m = flow.nn.MarginRankingLoss(margin = 10, reduction="mean")
+        >>> out = m(x1, x2, target)
+        >>> out
+        tensor([8.3333], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(self, margin=0.0, reduction: str = "mean") -> None:
+        super().__init__()
+        self.margin = margin
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.reduction = reduction
+
+    def forward(self, input1, input2, target):
+        res = flow.clip(
+            flow.add(
+                self.margin, flow.mul(target, flow.mul(-1, flow.sub(input1, input2)))
+            ),
+            min=0.0,
+        )
+        if self.reduction == "none":
+            return res
+        elif self.reduction == "sum":
+            return res.sum()
+        else:
+            return res.mean()
+
+
+class CTCLoss(Module):
+    """The Connectionist Temporal Classification loss.
+    The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html#torch.nn.CTCLoss
+
+    Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the
+    probability of possible alignments of input to target, producing a loss value which is differentiable
+    with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which
+    limits the length of the target sequence such that it must be :math:`\\leq` the input length.
+
+    Args:
+        blank (int, optional): blank label. Default :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the output losses will be divided by the target lengths and
+            then the mean over the batch is taken. Default: ``'mean'``
+        zero_infinity (bool, optional):
+            Whether to zero infinite losses and the associated gradients.
+            Default: ``False``
+            Infinite losses mainly occur when the inputs are too short
+            to be aligned to the targets.
+
+    Shape:
+        - Log_probs: Tensor of size :math:`(T, N, C)`,
+          where :math:`T = \\text{input length}`,
+          :math:`N = \\text{batch size}`, and
+          :math:`C = \\text{number of classes (including blank)}`.
+        - Targets: Tensor of size :math:`(N, S)` or
+          :math:`(\\operatorname{sum}(\\text{target\\_lengths}))`,
+          where :math:`N = \\text{batch size}` and
+          :math:`S = \\text{max target length, if shape is } (N, S)`.
+          It represent the target sequences. Each element in the target
+          sequence is a class index. And the target index cannot be blank (default=0).
+          In the :math:`(N, S)` form, targets are padded to the
+          length of the longest sequence, and stacked.
+          In the :math:`(\\operatorname{sum}(\\text{target\\_lengths}))` form,
+          the targets are assumed to be un-padded and
+          concatenated within 1 dimension.
+        - Input_lengths: Tuple or tensor of size :math:`(N)`,
+          where :math:`N = \\text{batch size}`. It represent the lengths of the
+          inputs (must each be :math:`\\leq T`). And the lengths are specified
+          for each sequence to achieve masking under the assumption that sequences
+          are padded to equal lengths.
+        - Target_lengths: Tuple or tensor of size :math:`(N)`,
+          where :math:`N = \\text{batch size}`. It represent lengths of the targets.
+          Lengths are specified for each sequence to achieve masking under the
+          assumption that sequences are padded to equal lengths. If target shape is
+          :math:`(N,S)`, target_lengths are effectively the stop index
+          :math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for
+          each target in a batch. Lengths must each be :math:`\\leq S`
+          If the targets are given as a 1d tensor that is the concatenation of individual
+          targets, the target_lengths must add up to the total length of the tensor.
+
+    Reference:
+        A. Graves et al.: Connectionist Temporal Classification:
+        Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
+        https://www.cs.toronto.edu/~graves/icml_2006.pdf
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> log_probs = np.array(
+        ...             [
+        ...                 [[-1.1031, -0.7998, -1.5200], [-0.9808, -1.1363, -1.1908]],
+        ...                 [[-1.2258, -1.0665, -1.0153], [-1.1135, -1.2331, -0.9671]],
+        ...                 [[-1.3348, -0.6611, -1.5118], [-0.9823, -1.2355, -1.0941]],
+        ...                 [[-1.3850, -1.3273, -0.7247], [-0.8235, -1.4783, -1.0994]],
+        ...                 [[-0.9049, -0.8867, -1.6962], [-1.4938, -1.3630, -0.6547]],
+        ...             ]
+        ...         ).astype(np.float32)
+        >>> log_probs = flow.Tensor(log_probs, dtype=flow.float32)
+        >>> targets = flow.Tensor(np.array([[1, 2, 2], [1, 2, 2]]).astype("int32"), dtype=flow.int32)
+        >>> input_lengths = flow.Tensor(np.array([5, 5]).astype("int32"), dtype=flow.int32)
+        >>> target_lengths = flow.Tensor(np.array([3, 3]).astype("int32"), dtype=flow.int32)
+        >>> loss_mean = flow.nn.CTCLoss()
+        >>> out = loss_mean(log_probs, targets, input_lengths, target_lengths)
+        >>> out
+        tensor([1.1376], dtype=oneflow.float32)
+        >>> loss_sum = flow.nn.CTCLoss(blank=0, reduction="sum")
+        >>> out = loss_sum(log_probs, targets, input_lengths, target_lengths)
+        >>> out
+        tensor([6.8257], dtype=oneflow.float32)
+        >>>
+
+    """
+
+    def __init__(
+        self, blank: int = 0, reduction: str = "mean", zero_infinity: bool = False
+    ) -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.reduction = reduction
+        self.zero_infinity = zero_infinity
+        self._op = (
+            flow.builtin_op("ctc_loss")
+            .Input("log_probs")
+            .Input("targets")
+            .Input("input_lengths")
+            .Input("target_lengths")
+            .Output("loss")
+            .Output("alpha")
+            .Attr("blank", int(blank))
+            .Attr("zero_infinity", zero_infinity)
+            .Build()
+        )
+        self._xdivy_op = (
+            flow.builtin_op("xdivy").Input("x").Input("y").Output("z").Build()
+        )
+        self.constant = _ConstantBase
+
+    def forward(
+        self,
+        log_probs: Tensor,
+        targets: Tensor,
+        input_lengths: Tensor,
+        target_lengths: Tensor,
+    ) -> Tensor:
+        (loss, _) = self._op(log_probs, targets, input_lengths, target_lengths)
+        if self.zero_infinity:
+            cond = flow.eq(
+                loss,
+                self.constant(
+                    size=loss.shape,
+                    value=float("inf"),
+                    dtype=loss.dtype,
+                    device=loss.device,
+                )(),
+            )
+            loss = flow.where(
+                cond,
+                flow.zeros(size=loss.shape, dtype=loss.dtype, device=loss.device),
+                loss,
+            )
+        if self.reduction == "mean":
+            return flow.mean(
+                self._xdivy_op(
+                    loss,
+                    flow.cast(flow.clamp(target_lengths, min=1), dtype=log_probs.dtype),
+                )[0]
+            )
+        elif self.reduction == "sum":
+            return flow.sum(loss)
+        else:
+            return loss
+
+
+class BCEWithLogitsLoss(Module):
+    """This operator combines the `Sigmoid` and `BCELoss` together. For numerical stability,
+    we apply some math tricks instead of using `Sigmoid` layer with `BCELoss`.
+
+    The equation is:
+
+    if :attr:`reduction` = ``"none"``:
+
+    .. math::
+
+        out = -weight*[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    if :attr:`reduction` = ``"mean"``:
+
+    .. math::
+
+        out = -\\frac{weight}{n}\\sum_{i=1}^n[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    if :attr:`reduction` = ``"sum"``:
+
+    .. math::
+
+        out = -weight*\\sum_{i=1}^n[Pos\\_weight*y*log\\sigma({x}) + (1-y)*log(1-\\sigma(x))]
+
+    Args:
+        weight (Tensor, optional): The manual rescaling weight to the loss. Default: ``None``
+        size_average (bool, optional) 鈥� Deprecated (see :attr:`reduction`). Default: ``True``
+        reduce (bool, optional) 鈥� Deprecated (see :attr:`reduction`). Default: ``True``
+        reduction (str, optional): The reduce type, it can be one of ``"none"``, ``"mean"``, ``"sum"``.
+            ``'none'``: no reduction will be applied, ``'mean'``: the sum of the output will be divided
+            by the number of elements in the output, ``'sum'``: the output will be summed. Default: ``"mean"``
+        pos_weight (Tensor, optional): The manual rescaling weight to the positive examples.
+            Default: ``None``
+
+    Shape:
+        - Input: :math:`(N,*)` where `*` means, any number of additional dimensions
+        - Target: :math:`(N,*)`, same shape as the input
+        - Output: scalar. If :attr:`reduction` is ``"none"``, then :math:`(N,*)`, same shape as input.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> input = flow.Tensor([[1.2, 0.2, -0.3], [0.7, 0.6, -2], [0.7, 0.6, -2]], dtype=flow.float32)
+        >>> target = flow.Tensor([[0, 1, 0], [1, 0, 1], [1, 0, 1]], dtype=flow.float32)
+        >>> weight = flow.Tensor([[2, 2, 2], [2, 2, 2], [2, 2, 2]], dtype=flow.float32)
+        >>> pos_weight = flow.Tensor([1.2, 1.3, 1.4], dtype=flow.float32)
+
+        >>> m = flow.nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight, reduction="none")
+        >>> out = m(input, target)
+        >>> out
+        tensor([[2.9266, 1.5552, 1.1087],
+                [0.9676, 2.075 , 5.9554],
+                [0.9676, 2.075 , 5.9554]], dtype=oneflow.float32)
+
+        >>> m = flow.nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight, reduction="mean")
+        >>> out = m(input, target)
+        >>> out
+        tensor([2.6207], dtype=oneflow.float32)
+
+        >>> m = flow.nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight, reduction="sum")
+        >>> out = m(input, target)
+        >>> out
+        tensor([23.5865], dtype=oneflow.float32)
+
+
+    """
+
+    def __init__(
+        self,
+        weight=None,
+        size_average: bool = True,
+        reduce: bool = True,
+        reduction: Optional[str] = "mean",
+        pos_weight=None,
+    ) -> None:
+        super().__init__()
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.weight = weight
+        self.size_average = size_average
+        self.reduce = reduce
+        self.reduction = reduction
+        self.pos_weight = pos_weight
+
+    def forward(self, input, target):
+        if not target.shape == input.shape:
+            raise ValueError(
+                "Target size ({}) must be the same as input size ({})".format(
+                    target.size(), input.size()
+                )
+            )
+        _neg_input = flow.negative(input)
+        _max_val = flow.clip(_neg_input, 0)
+        _neg_max_val = flow.negative(_max_val)
+        if self.pos_weight:
+            _log_weight = (self.pos_weight - 1) * target + 1
+            _loss = (1 - target) * input + _log_weight * (
+                flow.log(flow.exp(_neg_max_val) + flow.exp(_neg_input - _max_val))
+                + _max_val
+            )
+        else:
+            _loss = (1 - target) * input + _max_val
+            _loss += flow.log(flow.exp(_neg_max_val) + flow.exp(_neg_input - _max_val))
+        if self.weight is not None:
+            assert (
+                self.weight.shape == input.shape
+            ), "The weight shape must be the same as Input shape"
+            _weighted_loss = self.weight * _loss
+        else:
+            _weighted_loss = _loss
+        if self.reduction == "mean":
+            return flow.mean(_weighted_loss)
+        elif self.reduction == "sum":
+            return flow.sum(_weighted_loss)
+        else:
+            return _weighted_loss
+
+
+class SmoothL1Loss(Module):
+    """Creates a criterion that uses a squared term if the absolute
+    element-wise error falls below beta and an L1 term otherwise.
+    The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.SmoothL1Loss.html
+
+    It is less sensitive to outliers than :class:`torch.nn.MSELoss` and in some cases
+    prevents exploding gradients (e.g. see the paper `Fast R-CNN <https://openaccess.thecvf.com/content_iccv_2015/papers/Girshick_Fast_R-CNN_ICCV_2015_paper.pdf>`__ by Ross Girshick)..
+
+    For a batch of size :math:`N`, the unreduced loss can be described as:
+
+    .. math::
+        \\ell(x, y) = L = \\{l_1, ..., l_N\\}^T
+
+    with
+
+    .. math::
+        l_n = \\begin{cases}
+        0.5 (x_n - y_n)^2 / beta, & \\text{if } |x_n - y_n| < beta \\\\
+        |x_n - y_n| - 0.5 * beta, & \\text{otherwise }
+        \\end{cases}
+
+    If `reduction` is not `none`, then:
+
+    .. math::
+        \\ell(x, y) =
+        \\begin{cases}
+            \\operatorname{mean}(L), &  \\text{if reduction} = \\text{`mean';}\\\\
+            \\operatorname{sum}(L),  &  \\text{if reduction} = \\text{`sum'.}
+        \\end{cases}
+
+    .. note::
+        Smooth L1 loss can be seen as exactly :class:`L1Loss`, but with the :math:`|x - y| < beta`
+        portion replaced with a quadratic function such that its slope is 1 at :math:`|x - y| = beta`.
+        The quadratic segment smooths the L1 loss near :math:`|x - y| = 0`.
+
+    .. note::
+        Smooth L1 loss is closely related to :class:`HuberLoss`, being
+        equivalent to :math:`huber(x, y) / beta` (note that Smooth L1's beta hyper-parameter is
+        also known as delta for Huber). This leads to the following differences:
+
+        * As beta -> 0, Smooth L1 loss converges to :class:`L1Loss`, while :class:`HuberLoss`
+          converges to a constant 0 loss.
+        * As beta -> :math:`+\\infty`, Smooth L1 loss converges to a constant 0 loss, while
+          :class:`HuberLoss` converges to :class:`MSELoss`.
+        * For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant slope of 1.
+          For :class:`HuberLoss`, the slope of the L1 segment is beta.
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+        beta (float, optional): Specifies the threshold at which to change between L1 and L2 loss.
+            The value must be non-negative. Default: 1.0
+
+    Shape:
+        - Input: :math:`(N, *)` where :math:`*` means any number of additional dimensions
+        - Target: :math:`(N, *)`; same shape as the input
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`; same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.array([0.1, 0.4, 0.3, 0.5, 0.9]).astype(np.float32), dtype=flow.float32)
+        >>> y = flow.Tensor(np.array([0.3, 0.9, 2.5, 0.4, 0.3]).astype(np.float32), dtype=flow.float32)
+        >>> m = flow.nn.SmoothL1Loss(reduction="none")
+        >>> out = m(x, y)
+        >>> out
+        tensor([0.02 , 0.125, 1.7  , 0.005, 0.18 ], dtype=oneflow.float32)
+
+        >>> m = flow.nn.SmoothL1Loss(reduction="mean")
+        >>> out = m(x, y)
+        >>> out
+        tensor([0.406], dtype=oneflow.float32)
+
+        >>> m = flow.nn.SmoothL1Loss(reduction="sum")
+        >>> out = m(x, y)
+        >>> out
+        tensor([2.03], dtype=oneflow.float32)
+    """
+
+    def __init__(
+        self, size_average=None, reduce=None, reduction: str = "mean", beta: float = 1.0
+    ) -> None:
+        super().__init__()
+        if size_average is not None:
+            raise ValueError("Argument reduce is not supported yet")
+        if reduce is not None:
+            raise ValueError("Argument reduce is not supported yet")
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+        self.reduction = reduction
+        self.beta = beta
+        self._op = (
+            flow.builtin_op("smooth_l1_loss")
+            .Input("prediction")
+            .Input("label")
+            .Output("loss")
+            .Attr("beta", float(beta))
+            .Build()
+        )
+
+    def forward(self, input, target) -> Tensor:
+        loss = self._op(input, target)[0]
+        if self.reduction == "none":
+            return loss
+        elif self.reduction == "sum":
+            return flow.sum(loss)
+        elif self.reduction == "mean":
+            return flow.mean(loss)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/masked_fill.py b/python/oneflow/nn/modules/masked_fill.py
new file mode 100644
index 0000000000000000000000000000000000000000..599219be11d8c7472446375a6d6e00b45446d761
--- /dev/null
+++ b/python/oneflow/nn/modules/masked_fill.py
@@ -0,0 +1,77 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class MaskedFill(Module):
+    def __init__(self, value) -> None:
+        super().__init__()
+        self.value = value
+
+    def forward(self, input, mask):
+        in_shape = tuple(input.shape)
+        value_like_x = flow.Tensor(*in_shape, device=input.device)
+        value_like_x.fill_(self.value)
+        return flow.F.where(mask, value_like_x, input)
+
+
+@register_tensor_op("masked_fill")
+def masked_fill_op(input, mask, value):
+    """
+    Fills elements of :attr:`self` tensor with :attr:`value` where :attr:`mask` is True.
+    The shape of :attr:`mask` must be broadcastable with the shape of the underlying tensor.
+
+    Args:
+        mask (BoolTensor): the boolean mask
+        value (float): the value to fill in with
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> in_arr = np.array(
+        ...     [[[-0.13169311,  0.97277078,  1.23305363,  1.56752789],
+        ...     [-1.51954275,  1.87629473, -0.53301206,  0.53006478],
+        ...     [-1.38244183, -2.63448052,  1.30845795, -0.67144869]],
+        ...     [[ 0.41502161,  0.14452418,  0.38968   , -1.76905653],
+        ...     [ 0.34675095, -0.7050969 , -0.7647731 , -0.73233418],
+        ...     [-1.90089858,  0.01262963,  0.74693893,  0.57132389]]]
+        ... )
+        >>> fill_value = 8.7654321 # random value e.g. -1e9 3.1415
+        >>> input = flow.Tensor(in_arr, dtype=flow.float32)
+        >>> mask = flow.Tensor((in_arr > 0).astype(np.int8), dtype=flow.int)
+        >>> output = flow.masked_fill(input, mask, fill_value)
+
+        # tensor([[[-0.1317,  8.7654,  8.7654,  8.7654],
+        #  [-1.5195,  8.7654, -0.533 ,  8.7654],
+        #  [-1.3824, -2.6345,  8.7654, -0.6714]],
+
+        # [[ 8.7654,  8.7654,  8.7654, -1.7691],
+        #  [ 8.7654, -0.7051, -0.7648, -0.7323],
+        #  [-1.9009,  8.7654,  8.7654,  8.7654]]], dtype=oneflow.float32)
+
+    """
+    return MaskedFill(value)(input, mask)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/masked_select.py b/python/oneflow/nn/modules/masked_select.py
new file mode 100644
index 0000000000000000000000000000000000000000..deb7aaddd18929c6ea119295ab704f2457936df0
--- /dev/null
+++ b/python/oneflow/nn/modules/masked_select.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class MaskedSelect(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, mask):
+        assert len(x.shape) == len(
+            mask.shape
+        ), f"The dim of masked_select module's inputs can not match, please check!"
+        broadcast_like_shape = []
+        broadcast_x_axes = []
+        broadcast_mask_axes = []
+        for i in range(len(x.shape)):
+            max_dim = max(x.shape[i], mask.shape[i])
+            broadcast_like_shape.append(max_dim)
+            if max_dim != x.shape[i]:
+                broadcast_x_axes.append(i)
+            if max_dim != mask.shape[i]:
+                broadcast_mask_axes.append(i)
+        broadcast_like_tensor = flow.zeros(
+            tuple(broadcast_like_shape), dtype=flow.float32, device=x.device
+        )
+        broadcast_like_tensor.requires_grad = x.requires_grad or mask.requires_grad
+        if len(broadcast_x_axes) != 0:
+            x = flow.broadcast_like(
+                x, broadcast_like_tensor, broadcast_axes=tuple(broadcast_x_axes)
+            )
+        if len(broadcast_mask_axes) != 0:
+            mask = flow.broadcast_like(
+                mask, broadcast_like_tensor, broadcast_axes=tuple(broadcast_mask_axes)
+            )
+        mask = mask.to(dtype=x.dtype)
+        res = flow.F.mul(x, mask)
+        indices = flow.argwhere(res)
+        gather_res = flow.F.gather_nd(res, indices)
+        return gather_res.flatten()
+
+
+def masked_select_op(x, mask):
+    """
+
+    Returns a new 1-D tensor which indexes the input tensor according to the boolean mask mask which is a BoolTensor(In oneFlow BoolTensor is replaced by Int8Tensor).
+
+    The shapes of the mask tensor and the input tensor don鈥檛 need to match, but they must be broadcastable.
+
+    Args:
+        input (Tensor): the input tensor.
+        mask (Tensor): the tensor containing the binary mask to index with
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.array([[-0.4620, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]), dtype=flow.float32)
+        >>> mask = x.gt(0.05)
+        >>> out = flow.masked_select(x, mask)
+        >>> out
+        tensor([0.3139, 0.3898], dtype=oneflow.float32)
+    """
+    return MaskedSelect()(x, mask)
+
+
+@register_tensor_op("masked_select")
+def tensor_masked_select_op(x, mask):
+    """
+
+    See :func:`oneflow.masked_select`
+
+    """
+    return MaskedSelect()(x, mask)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/math_ops.py b/python/oneflow/nn/modules/math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe80399451414fc4452807a20ca39c383e43a8a6
--- /dev/null
+++ b/python/oneflow/nn/modules/math_ops.py
@@ -0,0 +1,1673 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _check_axis, _check_inplace_valid
+from oneflow.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class ScalarMul(Module):
+    def __init__(self, alpha) -> None:
+        super().__init__()
+        if not isinstance(alpha, (int, float)):
+            raise ValueError("alpha type can only be int or float")
+        self.alpha = alpha
+
+    def forward(self, x):
+        return flow.F.mul_scalar(x, self.alpha)
+
+
+class ScalarMulByTensor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.mul_scalar_by_tensor(x, y)
+
+
+class ElementwiseMul(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.mul(x, y)
+
+
+class BroadcastMul(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_mul(x, y)
+
+
+@register_tensor_op("mul")
+def _mul(input, other):
+    """Computes the multiplication of input by other for each element, scalar and broadcast promotation are supported.
+    
+    The formula is:
+
+    .. math::
+        out = input \\times other
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        # element-wise multiply
+        >>> input = flow.Tensor(np.random.randn(2,3))
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.mul(input,other).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar mutiply
+        >>> input = 5
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.mul(input,other).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast mutiply
+        >>> input = flow.Tensor(np.random.randn(1,1))
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.mul(input,other).numpy()
+        >>> out.shape 
+        (2, 3)
+
+    """
+    if isinstance(input, (int, float)):
+        return ScalarMul(input)(other)
+    elif isinstance(other, (int, float)):
+        return ScalarMul(other)(input)
+    elif input.shape == other.shape:
+        return ElementwiseMul()(input, other)
+    elif input.shape == (1,):
+        return ScalarMulByTensor()(other, input)
+    elif other.shape == (1,):
+        return ScalarMulByTensor()(input, other)
+    else:
+        return BroadcastMul()(input, other)
+
+
+class Variance(Module):
+    def __init__(self, dim: int = None, keepdim: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, input):
+        axis = _check_axis(self.dim, input.shape)
+        if isinstance(axis, list) and len(axis) == 0:
+            return flow.zeros(size=input.shape)
+        else:
+            return flow.sub(
+                flow.mean(flow.square(input), axis, self.keepdim),
+                flow.square(flow.mean(input, axis, self.keepdim)),
+            )
+
+
+@register_tensor_op("var")
+def variance_op(input, dim=None, keepdim=False):
+    """Returns the variance of each row of the `input` tensor in the given dimension `dim`.
+
+    If `keepdim` is `True`, the output tensor is of the same size as `input` except in the dimension(s) `dim` 
+    where it is of size 1. Otherwise, dim is squeezed (see `flow.squeeze()`), resulting in the output 
+    tensor having 1 (or `len(dim)`) fewer dimension(s).
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of python:ints): the dimension or dimensions to reduce. Defaults to None.
+        keepdim (bool, optional): whether the output tensor has dim retained or not. Defaults to False.
+
+    Returns:
+        Tensor: The result of variance on the specified axis of input Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> np_arr = np.random.randn(2,3,4,5)
+        >>> input = flow.Tensor(np_arr)
+        >>> output = flow.var(input, 1, True)
+
+    """
+    return Variance(dim, keepdim)(input)
+
+
+class ScalarSubByTensor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.sub_scalar_by_tensor(x, y)
+
+
+class BroadcastSub(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_sub(x, y)
+
+
+class ScalarAdd(Module):
+    def __init__(self, alpha, inplace: bool = False) -> None:
+        super().__init__()
+        if not isinstance(alpha, int) and (not isinstance(alpha, float)):
+            raise ValueError("scalar type can only be int or float")
+        self.alpha = alpha
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            _check_inplace_valid(x)
+        return flow.F.add_scalar(x, self.alpha, self.inplace)
+
+
+@register_tensor_op("sub")
+def _sub(input, other):
+    """Computes the subtraction of input by other for each element, scalar and broadcast promotation are supported.
+    The formula is:
+
+    .. math::
+        out = input - other
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        # element-wise subtract
+        >>> input = flow.Tensor(np.random.randn(2,3))
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.sub(input,other).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar subtract
+        >>> input = 5
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.sub(input,other).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast subtract
+        >>> input = flow.Tensor(np.random.randn(1,1))
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.sub(input,other).numpy()
+        >>> out.shape
+        (2, 3)
+
+    """
+    if isinstance(input, (int, float)):
+        return ScalarAdd(input)(ScalarMul(-1)(other))
+    elif isinstance(other, (int, float)):
+        return ScalarAdd(-1 * other)(input)
+    elif input.shape == other.shape:
+        return BroadcastSub()(input, other)
+    elif other.shape == (1,):
+        return ScalarSubByTensor()(input, other)
+    else:
+        return BroadcastSub()(input, other)
+
+
+class BroadcastDiv(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_div(x, y)
+
+
+class ScalarDivByTensor(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, scalar):
+        return flow.F.div_scalar_by_tensor(x, scalar)
+
+
+@register_tensor_op("div")
+def _div(input, other):
+    """Computes the division of input by other for each element, scalar and broadcast promotation are supported.
+    The formula is:
+
+    .. math::
+        out = \\frac{input}{other}
+    
+    Args:
+        input (Union[int, float, flow.Tensor]): input.
+        other (Union[int, float, flow.Tensor]): other.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        # element-wise divide
+        >>> input = flow.Tensor(np.random.randn(2,3))
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.div(input,other).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar divide
+        >>> input = 5
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.div(input,other).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast divide
+        >>> input = flow.Tensor(np.random.randn(1,1))
+        >>> other = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.div(input,other).numpy()
+        >>> out.shape 
+        (2, 3)
+
+    """
+    if isinstance(input, (int, float)):
+        return ScalarMul(input)(flow.reciprocal(other))
+    elif isinstance(other, (int, float)):
+        if other == 0 or other == 0.0:
+            other = 0.0
+        else:
+            other = 1.0 / float(other)
+        return ScalarMul(other)(input)
+    elif input.shape == other.shape:
+        return BroadcastDiv()(input, other)
+    elif other.shape == (1,):
+        return ScalarDivByTensor()(input, other)
+    else:
+        return BroadcastDiv()(input, other)
+
+
+class Reciprocal(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.reciprocal_no_nan(x)
+
+
+@register_tensor_op("reciprocal")
+def _reciprocal(x):
+    """Computes the safe reciprocal of x. If x is zero, the reciprocal will
+    be also set to zero.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = flow.Tensor(np.array([[1, 2, 3], [4, 5, 6]]))
+        >>> out = flow.reciprocal(x)
+        >>> out.numpy()
+        array([[1.        , 0.5       , 0.33333334],
+               [0.25      , 0.2       , 0.16666667]], dtype=float32)
+    """
+    return Reciprocal()(x)
+
+
+class ScalarAddByTensor(Module):
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, x, y):
+        if self.inplace:
+            _check_inplace_valid(x)
+        return flow.F.add_scalar_by_tensor(x, y, self.inplace)
+
+
+class ElementwiseAdd(Module):
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, x, y):
+        if self.inplace:
+            _check_inplace_valid(x)
+        return flow.F.add(x, y, self.inplace)
+
+
+class BroadcastAdd(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        return flow.F.broadcast_add(x, y)
+
+
+@register_tensor_op("add")
+def _add(x, y):
+    """Computes the addition of x by y for each element, scalar and broadcast promotation are supported.
+    The formula is:
+
+    .. math::
+        out = x + y
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        # element-wise add
+        >>> x = flow.Tensor(np.random.randn(2,3))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.add(x, y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # scalar add
+        >>> x = 5
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.add(x, y).numpy()
+        >>> out.shape
+        (2, 3)
+
+        # broadcast add
+        >>> x = flow.Tensor(np.random.randn(1,1))
+        >>> y = flow.Tensor(np.random.randn(2,3))
+        >>> out = flow.add(x, y).numpy()
+        >>> out.shape
+        (2, 3)
+
+    """
+    if isinstance(x, (int, float)):
+        return ScalarAdd(x)(y)
+    elif isinstance(y, (int, float)):
+        return ScalarAdd(y)(x)
+    elif x.shape == y.shape:
+        return ElementwiseAdd()(x, y)
+    elif x.shape == (1,):
+        return ScalarAddByTensor()(y, x)
+    elif y.shape == (1,):
+        return ScalarAddByTensor()(x, y)
+    else:
+        return BroadcastAdd()(x, y)
+
+
+@register_tensor_op("add_")
+def _add_inplace(x, y):
+    """
+    In-place version of :func:`oneflow.Tensor.add`.
+    """
+    if isinstance(y, (int, float)):
+        return ScalarAdd(y, inplace=True)(x)
+    elif x.shape == y.shape:
+        return ElementwiseAdd(inplace=True)(x, y)
+    elif x.shape == (1,):
+        raise RuntimeError(
+            f"output with shape {x.shape} doesn't match the broadcast shape {y.shape}"
+        )
+    elif y.shape == (1,):
+        return ScalarAddByTensor(inplace=True)(x, y)
+    else:
+        y = flow.broadcast_like(y, x)
+        return ElementwiseAdd(inplace=True)(x, y)
+
+
+class Asin(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.asin(x)
+
+
+def asin_op(input):
+    """
+    Returns a new tensor with the arcsine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\sin^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.array([-0.5,  0.8, 1.0,  -0.8]), dtype=flow.float32)
+        >>> output = flow.asin(input)
+        >>> output.shape
+        flow.Size([4])
+        >>> output
+        tensor([-0.5236,  0.9273,  1.5708, -0.9273], dtype=oneflow.float32)
+        >>> input1 = flow.Tensor(np.array([[0.8, 1.0], [-0.6, -1.0]]), dtype=flow.float32)
+        >>> output1 = input1.asin()
+        >>> output1.shape
+        flow.Size([2, 2])
+        >>> output1
+        tensor([[ 0.9273,  1.5708],
+                [-0.6435, -1.5708]], dtype=oneflow.float32)
+    """
+    return Asin()(input)
+
+
+@register_tensor_op("asin")
+def asin_op_tensor(input):
+    """
+
+    See :func:`oneflow.asin`
+    """
+    return Asin()(input)
+
+
+def arcsin_op(input):
+    """
+  
+    Alias for :func:`oneflow.asin`
+    """
+    return Asin()(input)
+
+
+@register_tensor_op("arcsin")
+def arcsin_op_tensor(input):
+    """
+
+    See :func:`oneflow.asin`
+    """
+    return Asin()(input)
+
+
+class Asinh(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.asinh(x)
+
+
+def asinh_op(input):
+    """
+    Returns a new tensor with the inverse hyperbolic sine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\sinh^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.array([2, 3, 4]), dtype=flow.float32)
+        >>> output = flow.asinh(input)
+        >>> output.shape
+        flow.Size([3])
+        >>> output
+        tensor([1.4436, 1.8184, 2.0947], dtype=oneflow.float32)
+
+        >>> input1 = flow.Tensor(np.array([[-1, 0, -0.4], [5, 7, 0.8]]), dtype=flow.float32)
+        >>> output1 = input1.asinh()
+        >>> output1.shape
+        flow.Size([2, 3])
+        >>> output1
+        tensor([[-0.8814,  0.    , -0.39  ],
+                [ 2.3124,  2.6441,  0.7327]], dtype=oneflow.float32)
+
+    """
+    return Asinh()(input)
+
+
+def arcsinh_op(input):
+    """
+  
+    Alias for :func:`oneflow.asinh`
+    """
+    return Asinh()(input)
+
+
+@register_tensor_op("asinh")
+def asinh_op_tensor(input):
+    """
+
+    See :func:`oneflow.asinh`
+    """
+    return Asinh()(input)
+
+
+@register_tensor_op("arcsinh")
+def arcsinh_op_tensor(input):
+    """
+
+    See :func:`oneflow.asinh`
+    """
+    return Asinh()(input)
+
+
+class Sin(Module):
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            _check_inplace_valid(x)
+        return flow.F.sin(x, self.inplace)
+
+
+def sin_op(tensor):
+    """
+    Returns a new tensor with the sine of the elements of :attr:`input`.
+
+    .. math::
+
+        \\text{out}_{i} = \\sin(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x1 = flow.Tensor(np.array([-0.5461,  0.1347, -2.7266, -0.2746]).astype(np.float32))
+        >>> out1 = flow.sin(x1)
+        >>> out1
+        tensor([-0.5194,  0.1343, -0.4032, -0.2712], dtype=oneflow.float32)
+        >>> x2 = flow.Tensor(np.array([-1.4, 2.6, 3.7]).astype(np.float32),device=flow.device('cuda'))
+        >>> out2 = flow.sin(x2)
+        >>> out2
+        tensor([-0.9854,  0.5155, -0.5298], device='cuda:0', dtype=oneflow.float32)
+
+    """
+    return Sin(inplace=False)(tensor)
+
+
+@register_tensor_op("sin")
+def sin_op_tensor(tensor):
+    """
+
+    sin() -> Tensor
+
+    See :func:`oneflow.sin`
+    
+    """
+    return Sin(inplace=False)(tensor)
+
+
+@register_tensor_op("sin_")
+def inplace_sin_op_tensor(x):
+    """
+    In-place version of :func:`oneflow.sin`
+    
+    """
+    return Sin(inplace=True)(x)
+
+
+class Cos(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.cos(x)
+
+
+@register_tensor_op("cos")
+def cos_op(tensor):
+    """
+    Returns a new tensor with the cosine  of the elements of :attr:`input`.
+    
+    .. math::
+        \\text{out}_{i} = \\cos(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> arr = np.array([1.4309,  1.2706, -0.8562,  0.9796])
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.cos(input).numpy()
+
+    """
+    return Cos()(tensor)
+
+
+class Atan(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.atan(x)
+
+
+def atan_op(tensor):
+    """
+    Returns a new tensor with the arctangent of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\tan^{-1}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+    
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.array([0.5, 0.6, 0.7]), dtype=flow.float32)
+        >>> output = flow.atan(input)
+        >>> output.shape
+        flow.Size([3])
+        
+    """
+    return Atan()(tensor)
+
+
+@register_tensor_op("atan")
+def atan_op_tensor(tensor):
+    """
+
+    See :func:`oneflow.atan`
+    
+    """
+    return Atan()(tensor)
+
+
+def arctan_op(tensor):
+    """
+    Alias for :func:`oneflow.atan`
+    
+    """
+    return Atan()(tensor)
+
+
+@register_tensor_op("arctan")
+def arctan_op_tensor(tensor):
+    """
+
+    See :func:`oneflow.arctan`
+    
+    """
+    return Atan()(tensor)
+
+
+class FMod(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if not isinstance(x, (flow.Tensor, flow._oneflow_internal.Tensor)):
+            raise ValueError("Expected type of input is Tensor")
+        if isinstance(y, (int, float)):
+            x = flow.F.cast(x, flow.float32)
+            y = flow.tensor([y], dtype=flow.float32, device=x.device)
+        elif isinstance(y, (flow.Tensor, flow._oneflow_internal.Tensor)):
+            if x.dtype != y.dtype:
+                x = flow.F.cast(x, flow.float32)
+                y = flow.F.cast(y, flow.float32)
+        else:
+            raise ValueError("Expected type of other is Tensor or Scalar")
+        return flow.F.fmod(x, y)
+
+
+def fmod_op(input, other):
+    """
+    fmod(input, other, *, out=None) -> Tensor
+
+    Computes the element-wise remainder of division.
+
+    The dividend and divisor may contain both for integer and floating point
+    numbers. The remainder has the same sign as the dividend :attr:`input`.
+
+    Supports broadcasting to a common shape, integer and float inputs.
+
+
+    Args:
+        input (Tensor): the dividend
+        other (Tensor or Scalar): the divisor
+
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+
+    Example::
+
+        >>> import oneflow as flow
+        >>> flow.fmod(flow.tensor([-3., -2, -1, 1, 2, 3]), 2)
+        tensor([-1., -0., -1.,  1.,  0.,  1.], dtype=oneflow.float32)
+        >>> flow.fmod(flow.tensor([1, 2, 3, 4, 5]), 1.5)
+        tensor([1. , 0.5, 0. , 1. , 0.5], dtype=oneflow.float32)
+        >>> flow.fmod(flow.tensor([1, 2, 3, 4, -5]), flow.tensor([4, 2, 1, 3., 1]))
+        tensor([ 1.,  0.,  0.,  1., -0.], dtype=oneflow.float32)
+
+    """
+    return FMod()(input, other)
+
+
+@register_tensor_op("fmod")
+def fmod_op_tensor(input, other):
+    """
+
+    See :func:`oneflow.fmod`
+    
+    """
+    return FMod()(input, other)
+
+
+class Log(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.log(x)
+
+
+@register_tensor_op("log")
+def log_op(tensor):
+    """
+    Returns a new tensor with the natural logarithm of the elements of :attr:`input`.
+    
+    .. math::
+        y_{i} = \\log_{e} (x_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> arr = np.random.randn(2, 3, 4, 5)
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.log(input)
+
+
+    """
+    return Log()(tensor)
+
+
+class Subtract(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if isinstance(x, (int, float)):
+            return ScalarAdd(x)(-1 * y)
+        elif isinstance(y, (int, float)):
+            return ScalarAdd(-1 * y)(x)
+        elif x.shape == y.shape:
+            return BroadcastSub()(x, y)
+        elif x.shape == (1,):
+            return ScalarSubByTensor()(y, x)
+        elif y.shape == (1,):
+            return ScalarSubByTensor()(x, y)
+        else:
+            return BroadcastSub()(x, y)
+
+
+class Sqrt(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.sqrt(input)
+
+
+@register_tensor_op("rsqrt")
+def rsqrt_op(input):
+    """Returns a new tensor with the reciprocal of the square-root of each of
+        the elements of :attr:`input`.
+
+        .. math::
+            \\text{out}_{i} = \\frac{1}{\\sqrt{\\text{input}_{i}}}
+
+        Args:
+            input (Tensor) 鈥� the input tensor.
+
+         For example:
+
+        .. code-block:: python
+
+            >>> import oneflow as flow
+            >>> import numpy as np
+            
+            >>> a = flow.Tensor(np.array([1.0, 2.0, 3.0]))
+            >>> out = flow.rsqrt(a).numpy()
+            >>> out
+            array([1.        , 0.70710677, 0.57735026], dtype=float32)
+    """
+    return Rsqrt()(input)
+
+
+class Rsqrt(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.rsqrt(input)
+
+
+@register_tensor_op("sqrt")
+def sqrt_op(input):
+    """Returns a new tensor with the square-root of the elements of :attr:`input`.
+
+        .. math::
+            \\text{out}_{i} = \\sqrt{\\text{input}_{i}}
+
+        Args:
+            input (Tensor): the input tensor.
+
+         For example:
+
+        .. code-block:: python
+
+            >>> import oneflow as flow
+            >>> import numpy as np
+            
+            >>> arr = np.array([1.0, 2.0, 3.0])
+            >>> input = flow.Tensor(arr)
+            >>> output = flow.sqrt(input).numpy()
+            >>> output
+            array([1.       , 1.4142135, 1.7320508], dtype=float32)
+        """
+    return Sqrt()(input)
+
+
+class Square(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.square(input)
+
+
+@register_tensor_op("square")
+def square_op(input):
+    """Returns a new tensor with the square of the elements of :attr:`input`.
+
+        .. math::
+            \\text{out}_{i} = \\sqrt{\\text{input}_{i}}
+
+        Args:
+            input (Tensor): the input tensor.
+
+         For example:
+
+        .. code-block:: python
+
+            >>> import oneflow as flow
+            >>> import numpy as np
+            
+            >>> arr = np.array([1.0, 2.0, 3.0])
+            >>> input = flow.Tensor(arr)
+            >>> output = flow.square(input).numpy()
+            >>> output
+            array([1., 4., 9.], dtype=float32)
+        """
+    return Square()(input)
+
+
+class Std(Module):
+    def __init__(self, dim=None, unbiased=True, keepdim=False) -> None:
+        super().__init__()
+        assert unbiased == True, "Only support 'unbiased=True' for now!"
+        self.unbiased = unbiased
+        self.keepdim = keepdim
+        self.dim = dim
+        self.reduce_count = 1
+        self.square_op = Square()
+        self.sqrt_op = Sqrt()
+        self.subtract_op = Subtract()
+
+    def forward(self, x):
+        self.axis = _check_axis(self.dim, x.shape)
+        if isinstance(self.axis, list) and len(self.axis) == 0:
+            return flow.zeros(size=x.shape)
+        else:
+            if len(self.axis) == 0:
+                self.reduce_count = x.nelement()
+            else:
+                for i in self.axis:
+                    self.reduce_count *= x.shape[i]
+            sum = (
+                flow.sum(self.square_op(x), self.axis, self.keepdim) / self.reduce_count
+            )
+            square = self.square_op(
+                flow.sum(x, self.axis, self.keepdim) / self.reduce_count
+            )
+            subtract = self.subtract_op(sum, square)
+            res = self.sqrt_op(subtract)
+            return res
+
+
+@register_tensor_op("std")
+def std_op(tensor, dim, unbiased=True, keepdim=False):
+    """
+    Returns the standard-deviation of each row of the :attr:`input` tensor in the
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+
+    If keepdim is True, the output tensor is of the same size as input except in 
+    the dimension(s) dim where it is of size 1. Otherwise, dim is squeezed, 
+    resulting in the output tensor having 1 (or len(dim)) fewer dimension(s).
+
+    If :attr:`unbiased` is ``False``, then the standard-deviation will be calculated
+    via the biased estimator. Otherwise, Bessel's correction will be used.
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of python:ints): the dimension or dimensions to reduce.
+        unbiased (bool): whether to use the unbiased estimation or not
+        keepdim (bool): whether the output tensor has `dim` retained or not.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> arr = np.array([1.0, 2.0, 3.0])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.std(input, dim=0).numpy()
+        >>> output
+        array([0.8164968], dtype=float32)
+
+    """
+    return Std(dim, unbiased, keepdim)(tensor)
+
+
+class Pow(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, y):
+        if isinstance(y, (int, float)):
+            return flow.F.pow_scalar(x, alpha=y)
+        else:
+            return flow.F.pow(x, y)
+
+
+@register_tensor_op("pow")
+def pow_op(tensor, exponent):
+    """Takes the power of each element in input with exponent and returns a tensor with the result. Exponent can be either a single float number, a single int number, or a tensor with the same shape as input.
+    When exponent is a scalar value, the operation applied is:
+
+    .. math::
+        \\text{out}_i = x_i ^ \\text{exponent}
+\u200b
+    When exponent is a tensor, the operation applied is:
+
+    .. math::
+        \\text{out}_i = x_i ^ {\\text{exponent}_i}
+
+    Args:
+        - input (Tensor): the input tensor.
+        - exponent (int, float, Tensor): the exponent.
+
+    Returns:
+        Tensor: The result of variance on the specified axis of input Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]))
+        >>> out = flow.pow(x, 2).numpy()
+        >>> out
+        array([ 1.,  4.,  9., 16., 25., 36.], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+        >>> y = flow.Tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+        >>> out = flow.pow(x, y).numpy()
+        >>> out
+        array([  1.,   4.,  27., 256.], dtype=float32)
+        
+    """
+    return Pow()(tensor, exponent)
+
+
+class Addmm(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x, mat1, mat2, alpha=1, beta=1):
+        if len(x.shape) > 2 or len(mat1.shape) > 2 or len(mat2.shape) > 2:
+            raise ValueError("input matrixes shape can not be greater than 2")
+        else:
+            return _mul(x, beta) + _mul(flow.F.matmul(mat1, mat2), alpha)
+
+
+def addmm_op(input, mat1, mat2, alpha=1, beta=1):
+    """addmm(beta=1, input, alpha=1, mat1, mat2, out=None) -> Tensor
+
+    Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`.
+    The matrix :attr:`input` is added to the final result.
+
+    If :attr:`mat1` is a :math:`(n \\times m)` tensor, :attr:`mat2` is a
+    :math:`(m \\times p)` tensor, then :attr:`input` must be
+    broadcastable with a :math:`(n \\times p)` tensor
+    and :attr:`out` will be a :math:`(n \\times p)` tensor.
+
+    :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between
+    :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively.
+
+    .. math::
+        \\text{out} = \\beta\\ \\text{input} + \\alpha\\ (\\text{mat1}_i \\mathbin{@} \\text{mat2}_i)
+
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+
+    Args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\\beta`)
+        input (Tensor): matrix to be added
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\\alpha`)
+        mat1 (Tensor): the first matrix to be multiplied
+        mat2 (Tensor): the second matrix to be multiplied
+        out (Tensor, optional): the output tensor.
+
+    For example:
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> input = flow.tensor(np.array([[1,2,4],[5,11,9.1]]))
+        >>> mat1 = flow.tensor(np.array([[7.3,1.9,7.3],[10.2,1,5.5]])) 
+        >>> mat2 = flow.tensor(np.array([[7.3,1.9,7.3],[10.2,1,5.5],[3.7,2.2,8.1]])) 
+        >>> output = flow.addmm(input, mat1, mat2)
+        >>> output
+        tensor([[100.68,  33.83, 126.87],
+                [110.01,  43.48, 133.61]], dtype=oneflow.float64)
+        >>> output.shape
+        flow.Size([2, 3])
+
+        >>> input2 = flow.tensor(np.array([1.7]))
+        >>> mat1 = flow.tensor(np.array([[1,2],[5,9.1],[7.7,1.4]]))
+        >>> mat2 = flow.tensor(np.array([[1,2,3.7],[5,9.1,6.8]]))
+        >>> output2 = flow.addmm(input2, mat1, mat2, alpha=1, beta=2)
+        >>> output2
+        tensor([[14.4 , 23.6 , 20.7 ],
+                [53.9 , 96.21, 83.78],
+                [18.1 , 31.54, 41.41]], dtype=oneflow.float64)
+        >>> output2.shape
+        flow.Size([3, 3])
+    """
+    return Addmm()(input, mat1, mat2, alpha, beta)
+
+
+@register_tensor_op("addmm")
+def addmm_op_tensor(input, mat1, mat2, alpha=1, beta=1):
+    """
+    See :func:`oneflow.addmm`
+    """
+    return Addmm()(input, mat1, mat2, alpha, beta)
+
+
+class Clamp(Module):
+    def __init__(self, min_value=None, max_value=None) -> None:
+        super().__init__()
+        if min_value is not None:
+            floating_min_value = float(min_value)
+            integral_min_value = int(min_value)
+        if max_value is not None:
+            floating_max_value = float(max_value)
+            integral_max_value = int(max_value)
+        if min_value is not None and max_value is not None:
+            self._op = (
+                flow.builtin_op("clip_by_scalar")
+                .Input("x")
+                .Output("y")
+                .Attr("floating_min", floating_min_value)
+                .Attr("integral_min", integral_min_value)
+                .Attr("floating_max", floating_max_value)
+                .Attr("integral_max", integral_max_value)
+                .Build()
+            )
+        elif min_value is not None:
+            self._op = (
+                flow.builtin_op("clip_by_scalar_min")
+                .Input("x")
+                .Output("y")
+                .Attr("floating_min", floating_min_value)
+                .Attr("integral_min", integral_min_value)
+                .Build()
+            )
+        elif max_value is not None:
+            self._op = (
+                flow.builtin_op("clip_by_scalar_max")
+                .Input("x")
+                .Output("y")
+                .Attr("floating_max", floating_max_value)
+                .Attr("integral_max", integral_max_value)
+                .Build()
+            )
+        else:
+            raise ValueError("min_value and max_value cannot be None at the same time")
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+def clamp_op(tensor, min=None, max=None):
+    """
+    Clamp all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]` and return
+    a resulting tensor:
+
+    .. math::
+        y_i = \\begin{cases}
+            \\text{min} & \\text{if } x_i < \\text{min} \\\\
+            x_i & \\text{if } \\text{min} \\leq x_i \\leq \\text{max} \\\\
+            \\text{max} & \\text{if } x_i > \\text{max}
+        \\end{cases}
+
+    If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, args :attr:`min`
+    and :attr:`max` must be real numbers, otherwise they should be integers.
+
+    Args:
+        input (Tensor): the input tensor.
+        min (Number): lower-bound of the range to be clamped to. Defaults to None.
+        max (Number): upper-bound of the range to be clamped to. Defaults to None.
+        out (Tensor, optional): the output tensor.
+
+    For example:
+
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> arr = np.array([0.2, 0.6, -1.5, -0.3])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.clamp(input, min=-0.5, max=0.5)
+        >>> output
+        tensor([ 0.2,  0.5, -0.5, -0.3], dtype=oneflow.float32)
+
+        >>> arr = np.array([0.2, 0.6, -1.5, -0.3])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.clamp(input, min=None, max=0.5)
+        >>> output
+        tensor([ 0.2,  0.5, -1.5, -0.3], dtype=oneflow.float32)
+
+        >>> arr = np.array([0.2, 0.6, -1.5, -0.3])
+        >>> input = flow.Tensor(arr)
+        >>> output = flow.clamp(input, min=-0.5, max=None)
+        >>> output
+        tensor([ 0.2,  0.6, -0.5, -0.3], dtype=oneflow.float32)
+
+    """
+    return Clamp(min, max)(tensor)
+
+
+@register_tensor_op("clamp")
+def clamp_op_tensor(tensor, min=None, max=None):
+    """
+    See :func:`oneflow.clamp`
+    """
+    return Clamp(min, max)(tensor)
+
+
+def clip_op(tensor, min=None, max=None):
+    """
+    Alias for :func:`oneflow.clamp`
+    """
+    return Clamp(min, max)(tensor)
+
+
+@register_tensor_op("clip")
+def clip_op_tensor(tensor, min=None, max=None):
+    """
+    See :func:`oneflow.clamp`
+    """
+    return Clamp(min, max)(tensor)
+
+
+class Cosh(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.cosh(x)
+
+
+@register_tensor_op("cosh")
+def cosh_op(tensor):
+    """
+    Returns a new tensor with the hyperbolic cosine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\cosh(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> arr = np.array([ 0.1632,  1.1835, -0.6979, -0.7325])
+        >>> input = flow.Tensor(arr, dtype=flow.float32)
+        >>> output = flow.cosh(input).numpy()
+        >>> output
+        array([1.0133467, 1.7859949, 1.2535787, 1.2804903], dtype=float32)
+
+    """
+    return Cosh()(tensor)
+
+
+class Erf(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input):
+        return flow.F.erf(input)
+
+
+@register_tensor_op("erf")
+def erf_op(input):
+    """Computes the error function of each element. The error function is defined as follows:
+
+    .. math::
+            \\operatorname{erf}(x)=\\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x} e^{-t^{2}} d t
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: The result Tensor   
+               
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.array([0, -1., 10.]), dtype=flow.float32)
+        >>> out = flow.erf(x)
+        >>> out.shape
+        flow.Size([3])
+        >>> out.numpy()
+        array([ 0.       , -0.8427008,  1.       ], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8]]), dtype=flow.float32)
+        >>> out = flow.erf(x)
+        >>> out.shape
+        flow.Size([2, 3])
+        >>> out.numpy()
+        array([[ 0.        , -0.8427008 ,  1.        ],
+               [ 1.        ,  1.        ,  0.74210095]], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8], [2, 3, 4]]), dtype=flow.float32)
+        >>> out = x.erf()
+        >>> out.shape
+        flow.Size([3, 3])
+        >>> out.numpy()
+        array([[ 0.        , -0.8427008 ,  1.        ],
+               [ 1.        ,  1.        ,  0.74210095],
+               [ 0.9953223 ,  0.9999779 ,  1.        ]], dtype=float32)
+
+    """
+    return Erf()(input)
+
+
+@register_tensor_op("erf")
+def erf_op_tensor(input):
+    """
+    See :func:`oneflow.erf`
+    """
+    return Erf()(input)
+
+
+class Erfc(Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.erfc_op = flow.builtin_op("erfc").Input("x").Output("y").Build()
+
+    def forward(self, input):
+        return self.erfc_op(input)[0]
+
+
+@register_tensor_op("erfc")
+def erfc_op(input):
+    """Computes the complementary error function of each element of input. The complementary error 
+    function is defined as follows:
+
+    .. math::
+            \\operatorname{erfc}(x)=1-\\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x} e^{-t^{2}} d t
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.array([0, -1., 10.]), dtype=flow.float32)
+        >>> out = flow.erfc(x)
+        >>> out.shape
+        flow.Size([3])
+        >>> out.numpy()
+        array([1.0000000e+00, 1.8427007e+00, 2.8025969e-45], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8]]), dtype=flow.float32)
+        >>> out = flow.erfc(x)
+        >>> out.shape
+        flow.Size([2, 3])
+        >>> out.numpy()
+        array([[1.0000000e+00, 1.8427007e+00, 2.8025969e-45],
+               [1.5374597e-12, 4.1838257e-23, 2.5789905e-01]], dtype=float32)
+
+        >>> x = flow.Tensor(np.array([[0, -1., 10.], [5, 7, 0.8], [2, 3, 4]]), dtype=flow.float32)
+        >>> out = x.erfc()
+        >>> out.shape
+        flow.Size([3, 3])
+        >>> out.numpy()
+        array([[1.0000000e+00, 1.8427007e+00, 2.8025969e-45],
+               [1.5374597e-12, 4.1838257e-23, 2.5789905e-01],
+               [4.6777348e-03, 2.2090499e-05, 1.5417259e-08]], dtype=float32)
+        
+    """
+    return Erfc()(input)
+
+
+@register_tensor_op("erfc")
+def erfc_op_tensor(input):
+    """
+    See :func:`oneflow.erfc`
+    """
+    return Erfc()(input)
+
+
+class Ceil(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.ceil(x)
+
+
+def ceil_op(x):
+    """Returns a new tensor with the ceil of the elements of :attr:`x`,
+    the smallest integer greater than or equal to each element.
+
+    The equation is: 
+
+    .. math::
+        \\text{out}_{i} = \\left\\lceil \\text{input}_{i} \\right\\rceil = \\left\\lfloor \\text{input}_{i} \\right\\rfloor + 1
+
+    Args:
+        x (oneflow.Tensor): A Tensor.
+    
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example: 
+
+
+    .. code-block:: python 
+        
+        >>> import oneflow as flow
+        >>> import numpy as np   
+        >>> x = flow.Tensor(np.array([0.1, -2, 3.4]).astype(np.float32))
+        >>> y = flow.ceil(x)
+        >>> print(y.shape)
+        flow.Size([3])
+        >>> print(y.numpy())
+        [ 1. -2.  4.]
+
+
+        >>> x = flow.Tensor(np.array([[2.5, 4.6, 0.6],[7.8, 8.3, 9.2]]).astype(np.float32))
+        >>> y = x.ceil()
+        >>> print(y.shape)
+        flow.Size([2, 3])
+        >>> print(y.numpy())
+        [[ 3.  5.  1.]
+         [ 8.  9. 10.]]
+
+
+
+
+        >>> x = flow.Tensor(np.array([[[2.2, 4.4, 6.5],[7.1, 8.2, 9.3]],[[10.6,11.2,12.2],[13.5,14.8,15.9]]]).astype(np.float32))
+        >>> y = flow.ceil(x)
+        >>> print(y.shape)
+        flow.Size([2, 2, 3])
+        >>> print(y.numpy())
+        [[[ 3.  5.  7.]
+          [ 8.  9. 10.]]
+        <BLANKLINE>
+         [[11. 12. 13.]
+          [14. 15. 16.]]]
+
+    """
+    return Ceil()(x)
+
+
+@register_tensor_op("ceil")
+def ceil_op_tensor(x):
+    """
+    See :func:`oneflow.ceil`
+    """
+    return Ceil()(x)
+
+
+class Expm1(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.expm1(x)
+
+
+def expm1_op(x):
+    """Returns a new tensor with the exponential of the elements minus 1
+    of :attr:`x`.
+
+
+    The equation is: 
+
+    .. math::
+        y_{i} = e^{x_{i}} - 1
+
+    Args:
+        x (oneflow.Tensor): A Tensor.
+    
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example: 
+
+    .. code-block:: python 
+        
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = flow.Tensor(np.array([1, 2, 3]).astype(np.float32))
+        >>> y = flow.expm1(x)
+        >>> y.shape
+        flow.Size([3])
+        >>> y
+        tensor([ 1.7183,  6.3891, 19.0855], dtype=oneflow.float32)
+
+
+        >>> x = flow.Tensor(np.array([[2, 4, 6],[7, 8, 9]]).astype(np.float32))
+        >>> y = x.expm1()
+        >>> y.shape
+        flow.Size([2, 3])
+        >>> y
+        tensor([[6.3891e+00, 5.3598e+01, 4.0243e+02],
+                [1.0956e+03, 2.9800e+03, 8.1021e+03]], dtype=oneflow.float32)
+
+
+
+        >>> x = flow.Tensor(np.array([[[2, 4, 6],[7, 8, 9]],[[10,11,12],[13,14,15]]]).astype(np.float32))
+        >>> y = flow.expm1(x)
+        >>> print(y.shape)
+        flow.Size([2, 2, 3])
+        >>> print(y.numpy())
+        [[[6.3890562e+00 5.3598152e+01 4.0242880e+02]
+          [1.0956332e+03 2.9799580e+03 8.1020840e+03]]
+        <BLANKLINE>
+         [[2.2025465e+04 5.9873141e+04 1.6275380e+05]
+          [4.4241238e+05 1.2026032e+06 3.2690165e+06]]]
+
+
+    """
+    return Expm1()(x)
+
+
+@register_tensor_op("expm1")
+def expm1_op_tensor(x):
+    """
+    See :func:`oneflow.expm1`
+    """
+    return Expm1()(x)
+
+
+class Topk(Module):
+    def __init__(
+        self, k, dim: int = None, largest: bool = True, sorted: bool = True
+    ) -> None:
+        super().__init__()
+        self._op_topk_last_dim = (
+            flow.builtin_op("top_k")
+            .Input("in")
+            .Output("out")
+            .Attr("k", k)
+            .Attr("sorted", sorted)
+            .Build()
+        )
+        self.dim = dim
+        self.largest = largest
+
+    def forward(self, input):
+        if self.dim == None:
+            self.dim = -1
+        num_axes = len(input.shape)
+        axis = self.dim if self.dim >= 0 else self.dim + num_axes
+        assert 0 <= axis < num_axes, "axis out of range"
+        if axis == num_axes - 1:
+            if self.largest:
+                indices = self._op_topk_last_dim(input)[0]
+            else:
+                neg_input = flow.mul(input, -1)
+                indices = self._op_topk_last_dim(neg_input)[0]
+            return (flow.gather(input, indices, dim=axis), indices)
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
+            x = flow.F.transpose(input, perm=perm)
+            if self.largest:
+                indices = self._op_topk_last_dim(x)[0]
+            else:
+                neg_input = flow.mul(x, -1)
+                indices = self._op_topk_last_dim(neg_input)[0]
+            indices = flow.F.transpose(indices, perm=get_inversed_perm(perm))
+            return (flow.gather(input, indices, dim=axis), indices)
+
+
+@register_tensor_op("topk")
+def topk_op(input, k, dim: int = None, largest: bool = True, sorted: bool = True):
+    """Finds the values and indices of the k largest entries at specified axis.
+
+    Args:
+        input (oneflow.Tensor): Input Tensor
+        dim (int, optional): the dimension to sort along. Defaults to the last dim (-1)
+        largest (bool, optional): controls whether to return largest or smallest elements
+        sorted (bool, optional): controls whether to return the elements in sorted order
+
+    Returns:
+        Tuple(oneflow.Tensor, oneflow.Tensor(dtype=int32)): A tuple of (values, indices), where
+        the indices are the indices of the elements in the original input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = np.array([[1, 3, 8, 7, 2], [1, 9, 4, 3, 2]], dtype=np.float32)
+        >>> (values, indices) = flow.topk(flow.Tensor(x), k=3, dim=1)
+        >>> values
+        tensor([[8., 7., 3.],
+                [9., 4., 3.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[2, 3, 1],
+                [1, 2, 3]], dtype=oneflow.int32)
+        >>> values.shape
+        flow.Size([2, 3])
+        >>> indices.shape
+        flow.Size([2, 3])
+        >>> (values, indices) = flow.topk(flow.Tensor(x), k=2, dim=1, largest=False)
+        >>> values
+        tensor([[1., 2.],
+                [1., 2.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[0, 4],
+                [0, 4]], dtype=oneflow.int32)
+        >>> values.shape
+        flow.Size([2, 2])
+        >>> indices.shape
+        flow.Size([2, 2])
+
+    """
+    return Topk(k=k, dim=dim, largest=largest, sorted=sorted)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/matmul.py b/python/oneflow/nn/modules/matmul.py
new file mode 100644
index 0000000000000000000000000000000000000000..b72869c54e56f41ef27601dc53e6f7c09cd1cc33
--- /dev/null
+++ b/python/oneflow/nn/modules/matmul.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+import oneflow.framework.id_util as id_util
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class MatMul(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, a, b):
+        assert len(a.shape) >= 2, "Tensor a's dim should >=2"
+        assert len(b.shape) >= 2, "Tensor b's dim should >=2"
+        if len(a.shape) == len(b.shape):
+            if len(a.shape) == 2:
+                res = flow.F.matmul(a, b)
+            else:
+                res = flow.F.batch_matmul(a, b)
+        else:
+            assert (
+                len(b.shape) == 2
+            ), "Not support number of dimensions of a being less than number of dimensions of b!"
+            res = flow.F.broadcast_matmul(a, b)
+        return res
+
+
+@register_tensor_op("matmul")
+def matmul_op(input, other):
+    """This operator applies matrix multiplication to two Tensor.
+
+    Args:
+        a (oneflow.Tensor): A Tensor
+        b (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input1 = flow.Tensor(np.random.randn(2, 6), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.random.randn(6, 5), dtype=flow.float32)
+        >>> of_out = flow.matmul(input1, input2)
+        >>> of_out.shape
+        flow.Size([2, 5])
+
+    """
+    return MatMul()(input, other)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/meshgrid.py b/python/oneflow/nn/modules/meshgrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ad591061583ea5d93c3423c478bc63e21b25c24
--- /dev/null
+++ b/python/oneflow/nn/modules/meshgrid.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+class MeshGrid(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, inputs):
+        size = len(inputs)
+        assert size > 0, f"meshgrid expects a non-empty TensorList"
+        shape = list()
+        for i in range(size):
+            assert inputs[i].dim() <= 1, f(
+                "Expected scalar or 1D tensor in the tensor list but got: ", inputs[i]
+            )
+            if inputs[i].dim() == 0:
+                shape.append(1)
+            else:
+                shape.append(inputs[i].shape[0])
+        for i in range(size - 1):
+            assert (
+                inputs[i].dtype == inputs[i + 1].dtype
+                and inputs[i].device == inputs[i + 1].device
+            ), f"meshgrid expects all tensors to have the same dtype and device"
+        outputs = []
+        for i in range(size):
+            view_shape = [1] * size
+            view_shape[i] = -1
+            outputs.append(inputs[i].reshape(view_shape).expand(*shape))
+        return outputs
+
+
+def meshgrid_op(*inputs):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/_modules/torch/functional.html#meshgrid
+    
+    Take :math:`N` tensors, each of which can be either scalar or 1-dimensional
+    vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by
+    expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs.
+
+    Args:
+        tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be
+            treated as tensors of size :math:`(1,)` automatically
+
+    Returns:
+        seq (sequence of Tensors): If the input has :math:`k` tensors of size
+        :math:`(N_1,), (N_2,), \\ldots , (N_k,)`, then the output would also have :math:`k` tensors,
+        where all tensors are of size :math:`(N_1, N_2, \\ldots , N_k)`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input1 = flow.Tensor(np.array([1, 2, 3]), dtype=flow.float32)
+        >>> input2 = flow.Tensor(np.array([4, 5, 6]), dtype=flow.float32)
+        >>> of_x, of_y = flow.meshgrid(input1, input2)
+        >>> of_x
+        tensor([[1., 1., 1.],
+                [2., 2., 2.],
+                [3., 3., 3.]], dtype=oneflow.float32)
+        >>> of_y
+        tensor([[4., 5., 6.],
+                [4., 5., 6.],
+                [4., 5., 6.]], dtype=oneflow.float32)
+    """
+    return MeshGrid()(inputs)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/ne.py b/python/oneflow/nn/modules/ne.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdbcd55b980266c813e475c0627e33bb497c7dcd
--- /dev/null
+++ b/python/oneflow/nn/modules/ne.py
@@ -0,0 +1,79 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Ne(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, input, other):
+        if isinstance(other, flow.Tensor) or isinstance(
+            other, flow._oneflow_internal.Tensor
+        ):
+            for i in range(len(input.size())):
+                assert (
+                    input.shape[i] >= other.shape[i]
+                ), "The second tensor's shape should broadcastable with the first argument."
+                if input.dtype != other.dtype:
+                    other = other.to(dtype=input.dtype)
+        elif isinstance(other, int) or isinstance(other, float):
+            other = flow.Tensor([other], dtype=input.dtype, device=input.device)
+        else:
+            raise NotImplementedError(
+                "Unsupport data type, The second argument can be a tensor whose shape is broadcastable with the first argument."
+            )
+        return flow.F.broadcast_not_equal(input, other)
+
+
+@register_tensor_op("ne")
+def ne_op(input, other):
+    """
+    Computes element-wise not equality.
+    The second argument can be a number or a tensor whose shape is broadcastable with the first argument.
+
+    Args:
+        input (oneflow.Tensor): the tensor to compare
+        other (oneflow.Tensor, float or int): the target to compare
+
+    Returns:
+
+        - A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> input = flow.Tensor(np.array([2, 3, 4, 5]), dtype=flow.float32)
+        >>> other = flow.Tensor(np.array([2, 3, 4, 1]), dtype=flow.float32)
+
+        >>> y = flow.ne(input, other)
+        >>> y
+        tensor([0, 0, 0, 1], dtype=oneflow.int8)
+
+    """
+    return Ne()(input, other)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/negative.py b/python/oneflow/nn/modules/negative.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf42c096ede17b3649155495e303e02870bd7e10
--- /dev/null
+++ b/python/oneflow/nn/modules/negative.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Negative(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.negative(x)
+
+
+@register_tensor_op("negative")
+def negative_op(x):
+    """This operator computes the negative value of Tensor.
+
+    Args:
+        x (oneflow.Tensor): A Tensor
+
+    Returns:
+        oneflow.Tensor: The result Tensor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input = flow.Tensor(
+        ...    np.array([1.0, -1.0, 2.3]).astype(np.float32), dtype=flow.float32
+        ... )
+        >>> out = flow.negative(input)
+        >>> out
+        tensor([-1. ,  1. , -2.3], dtype=oneflow.float32)
+
+    """
+    return Negative()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/norm.py b/python/oneflow/nn/modules/norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..444bc7363373d253e8b95277be5c12b9a7fb0184
--- /dev/null
+++ b/python/oneflow/nn/modules/norm.py
@@ -0,0 +1,451 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+def check_dim(num_dims, input_dim):
+    if input_dim == None:
+        dim = input_dim
+    elif isinstance(input_dim, (int, tuple)):
+        if isinstance(input_dim, int):
+            dim = input_dim if input_dim >= 0 else input_dim + num_dims
+            if dim >= num_dims or dim < 0:
+                raise IndexError("Dimension out of range")
+        else:
+            temp = list(input_dim)
+            for i in range(len(temp)):
+                temp[i] = temp[i] if temp[i] >= 0 else temp[i] + num_dims
+                if temp[i] >= num_dims or temp[i] < 0:
+                    raise IndexError("Dimension out of range")
+            dim = temp
+    else:
+        raise TypeError(
+            "linalg_vector_norm(): argument 'dim' must be tuple of ints, not {}".format(
+                type(input_dim)
+            )
+        )
+    return dim
+
+
+def _norm_min_max(input, ord, dim, keepdim):
+    if ord > 0:
+        return flow.max(input, dim=dim, keepdim=keepdim)
+    else:
+        return flow.min(input, dim=dim, keepdim=keepdim)
+
+
+class Vector_Norm(Module):
+    def __init__(self, ord=2, dim=None, keepdim=False) -> None:
+        super().__init__()
+        if ord == None:
+            self.ord = 2.0
+        elif isinstance(ord, (int, float)):
+            self.ord = float(ord)
+        else:
+            raise TypeError(
+                "linalg_vector_norm(): argument 'ord' must be Number, not {}".format(
+                    type(ord)
+                )
+            )
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def _vector_norm(self, x, ord, dim, keepdim=False):
+        if ord == 0:
+            return flow.cast(flow.tensor([flow.argwhere(x).shape[0]]), flow.float32)
+        elif ord == float("inf"):
+            return flow.max(flow.abs(x), dim=dim, keepdim=keepdim)
+        elif ord == float("-inf"):
+            return flow.min(flow.abs(x), dim=dim, keepdim=keepdim)
+        else:
+            return flow.pow(
+                flow.sum(flow.pow(flow.abs(x), ord), dim=dim, keepdim=keepdim),
+                1.0 / ord,
+            )
+
+    def forward(self, x):
+        num_dims = len(x.shape)
+        dim = check_dim(num_dims, self.dim)
+        if dim == None:
+            return self._vector_norm(
+                x.flatten(), ord=self.ord, dim=self.dim, keepdim=self.keepdim
+            )
+        else:
+            return self._vector_norm(x, ord=self.ord, dim=dim, keepdim=self.keepdim)
+
+
+class Matrix_Norm(Module):
+    def __init__(self, ord="fro", dim=(-2, -1), keepdim=False) -> None:
+        super().__init__()
+        if isinstance(ord, str):
+            assert ord in ["fro", "nuc"], "{} are not supported in matrix norm".format(
+                ord
+            )
+            self.ord = ord
+        elif isinstance(ord, float):
+            assert ord in [
+                float("inf"),
+                float("-inf"),
+            ], "{} are not supported in matrix norm".format(ord)
+            self.ord = ord
+        elif isinstance(ord, int):
+            assert ord in [1, -1, 2, -2], "{} are not supported in matrix norm".format(
+                ord
+            )
+            self.ord = ord
+        elif ord == None:
+            self.ord = "fro"
+        else:
+            raise TypeError(
+                "linalg_matrix_norm(): argument 'ord' must be Number, not {}".format(
+                    type(ord)
+                )
+            )
+        if isinstance(dim, tuple) and len(dim) == 2 and (dim[0] != dim[1]):
+            self.dim = dim
+        else:
+            raise TypeError(
+                "linalg.matrix_norm(): dim must be a 2-tuple of ints with different elements"
+            )
+        self.keepdim = keepdim
+
+    def _matrix_norm(self, x, ord, dim, keepdim):
+        if ord == "nuc":
+            raise NotImplementedError
+        elif ord == "fro":
+            return flow.sqrt(flow.sum(flow.square(x), dim=dim, keepdim=keepdim))
+        elif ord in [float("inf"), float("-inf")]:
+            (dim_0, dim_1) = (dim[0], dim[1])
+            (dim_0, dim_1) = (dim_1, dim_0)
+            if dim_1 > dim_0 and (not keepdim):
+                dim_1 -= 1
+            res = flow.sum(flow.abs(x), dim=dim_0, keepdim=keepdim)
+            return _norm_min_max(res, ord, dim_1, keepdim)
+        elif ord in [1, -1]:
+            (dim_0, dim_1) = (dim[0], dim[1])
+            if dim_1 > dim_0 and (not keepdim):
+                dim_1 -= 1
+            res = flow.sum(flow.abs(x), dim=dim_0, keepdim=keepdim)
+            return _norm_min_max(res, ord, dim_1, keepdim)
+        elif ord in [2, -2]:
+            raise NotImplementedError
+        else:
+            raise ValueError("Invalid norm order: {}".format(ord))
+
+    def forward(self, x):
+        num_dims = len(x.shape)
+        if num_dims < 2:
+            raise RuntimeError(
+                "linalg.matrix_norm(): input tensor must be a matrix or batch of matrices"
+            )
+        dim = check_dim(num_dims, self.dim)
+        return self._matrix_norm(x, ord=self.ord, dim=dim, keepdim=self.keepdim)
+
+
+class Norm(Module):
+    def __init__(self, ord=None, dim=None, keepdim=False) -> None:
+        super().__init__()
+        self.ord = ord
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, x):
+        if isinstance(self.dim, int):
+            res = Vector_Norm(ord=self.ord, dim=self.dim, keepdim=self.keepdim)(x)
+        elif isinstance(self.dim, tuple):
+            res = Matrix_Norm(ord=self.ord, dim=self.dim, keepdim=self.keepdim)(x)
+        elif self.dim == None and self.ord != None:
+            assert (
+                len(x.shape) <= 2
+            ), "input must be 1-D or 2-D when dim is None and ord is not None"
+            if len(x.shape) == 1:
+                res = Vector_Norm(ord=self.ord, keepdim=self.keepdim)(x)
+            else:
+                res = Matrix_Norm(ord=self.ord, keepdim=self.keepdim)(x)
+        elif self.dim == None and self.ord == None:
+            res = Vector_Norm(keepdim=self.keepdim)(x)
+        return res
+
+
+def norm_op(input, ord=None, dim=None, keepdim=False):
+    """linalg.norm(input, ord=None, dim=None, keepdim=False, *, out=None) -> Tensor
+
+    Returns the matrix norm or vector norm of a given tensor.
+
+    This function can calculate one of eight different types of matrix norms, or one
+    of an infinite number of vector norms, depending on both the number of reduction
+    dimensions and the value of the `ord` parameter.
+
+    Args:
+        input (Tensor): The input tensor. If dim is None, input must be 1-D or 2-D, unless :attr:`ord`
+            is None. If both :attr:`dim` and :attr:`ord` are None, the 2-norm of the input flattened to 1-D
+            will be returned. Its data type must be either a floating point or complex type. For complex
+            inputs, the norm is calculated on of the absolute values of each element. If the input is
+            complex and neither :attr:`dtype` nor :attr:`out` is specified, the result's data type will
+            be the corresponding floating point type (e.g. float if :attr:`input` is complexfloat).
+
+        ord (int, float, inf, -inf, 'fro', 'nuc', optional): The order of norm.
+            inf refers to :attr:`float('inf')`, numpy's :attr:`inf` object, or any equivalent object.
+            The following norms can be calculated:
+
+            =====  ============================  ==========================
+            ord    norm for matrices             norm for vectors
+            =====  ============================  ==========================
+            None   Frobenius norm                2-norm
+            'fro'  Frobenius norm                -- not supported --
+            'nuc'  -- not supported yet --       -- not supported --
+            inf    max(sum(abs(x), dim=1))       max(abs(x))
+            -inf   min(sum(abs(x), dim=1))       min(abs(x))
+            0      -- not supported --           sum(x != 0)
+            1      max(sum(abs(x), dim=0))       as below
+            -1     min(sum(abs(x), dim=0))       as below
+            2      -- not supported yet --       as below
+            -2     -- not supported yet --       as below
+            other  -- not supported --           sum(abs(x)**ord)**(1./ord)
+            =====  ============================  ==========================
+
+            Default: ``None``
+
+        dim (int, 2-tuple of ints, 2-list of ints, optional): If :attr:`dim` is an int,
+            vector norm will be calculated over the specified dimension. If :attr:`dim`
+            is a 2-tuple of ints, matrix norm will be calculated over the specified
+            dimensions. If :attr:`dim` is None, matrix norm will be calculated
+            when the input tensor has two dimensions, and vector norm will be
+            calculated when the input tensor has one dimension. Default: ``None``
+
+        keepdim (bool, optional): If set to True, the reduced dimensions are retained
+            in the result as dimensions with size one. Default: ``False``
+
+        out (Tensor, optional): The output tensor.
+
+    Examples::
+
+        >>> import oneflow as flow
+        >>> from oneflow import linalg as LA
+        >>> import numpy as np
+        >>> a = flow.tensor(np.arange(9, dtype=np.float32) - 4)
+        >>> a
+        tensor([-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.], dtype=oneflow.float32)
+        >>> b = a.reshape((3, 3))
+        >>> b
+        tensor([[-4., -3., -2.],
+                [-1.,  0.,  1.],
+                [ 2.,  3.,  4.]], dtype=oneflow.float32)
+
+        >>> LA.norm(a)
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(b)
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(b, 'fro')
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(a, float('inf'))
+        tensor([4.], dtype=oneflow.float32)
+        >>> LA.norm(b, float('inf'))
+        tensor([9.], dtype=oneflow.float32)
+        >>> LA.norm(a, -float('inf'))
+        tensor([0.], dtype=oneflow.float32)
+        >>> LA.norm(b, -float('inf'))
+        tensor([2.], dtype=oneflow.float32)
+
+        >>> LA.norm(a, 1)
+        tensor([20.], dtype=oneflow.float32)
+        >>> LA.norm(b, 1)
+        tensor([7.], dtype=oneflow.float32)
+        >>> LA.norm(a, -1)
+        tensor([0.], dtype=oneflow.float32)
+        >>> LA.norm(b, -1)
+        tensor([6.], dtype=oneflow.float32)
+        >>> LA.norm(a, 2)
+        tensor([7.746], dtype=oneflow.float32)
+        >>> LA.norm(a, -2)
+        tensor([0.], dtype=oneflow.float32)
+        >>> LA.norm(a, 3)
+        tensor([5.848], dtype=oneflow.float32)
+        >>> LA.norm(a, -3)
+        tensor([0.], dtype=oneflow.float32)
+
+    Using the :attr:`dim` argument to compute vector norms::
+
+        >>> c = flow.tensor([[1., 2., 3.],
+        ...                   [-1, 1, 4]])
+        >>> LA.norm(c, dim=0)
+        tensor([1.4142, 2.2361, 5.    ], dtype=oneflow.float32)
+        >>> LA.norm(c, dim=1, keepdim = True)
+        tensor([[3.7417],
+                [4.2426]], dtype=oneflow.float32)
+        >>> LA.norm(c, ord=1, dim=1)
+        tensor([6., 6.], dtype=oneflow.float32)
+
+    Using the :attr:`dim` argument to compute matrix norms::
+
+        >>> m = flow.tensor(np.arange(8, dtype=np.float32)).reshape((2, 2, 2))
+        >>> LA.norm(m, dim=(1,2))
+        tensor([ 3.7417, 11.225 ], dtype=oneflow.float32)
+    """
+    return Norm(ord, dim, keepdim)(input)
+
+
+@register_tensor_op("norm")
+def norm_tensor_op(input, ord=None, dim=None, keepdim=False):
+    """
+    See :func:`oneflow.linalg.norm`
+    """
+    return Norm(ord, dim, keepdim)(input)
+
+
+def vector_norm_tensor_op(input, ord=2, dim=None, keepdim=False):
+    """
+    linalg.vector_norm(input, ord=2, dim=None, keepdim=False, *, dtype=None, out=None) -> Tensor
+
+    Computes a vector norm.
+
+    Supports input of float, double dtypes.
+
+    This function does not necessarily treat multidimensonal attr:`input` as a batch of
+    vectors, instead:
+
+    - If :attr:`dim`\\ `= None`, :attr:`input` will be flattened before the norm is computed.
+    - If :attr:`dim` is an `int` or a `tuple`, the norm will be computed over these dimensions and the other dimensions will be treated as batch dimensions.
+
+    This behavior is for consistency with :func:`flow.linalg.norm`.
+
+    :attr:`ord` defines the vector norm that is computed. The following norms are supported:
+
+    ======================   ========================================================
+    :attr:`ord`              vector norm
+    ======================   ========================================================
+    `2` (default)            `2`-norm (see below)
+    `inf`                    `max(abs(x))`
+    `-inf`                   `min(abs(x))`
+    `0`                      `sum(x != 0)`
+    other `int` or `float`   `sum(abs(x)^{ord})^{(1 / ord)}`
+    ======================   ========================================================
+
+    where `inf` refers to `float('inf')`, NumPy's `inf` object, or any equivalent object.
+
+
+    Args:
+        input (Tensor): tensor, flattened by default, but this behavior can be
+            controlled using :attr:`dim`.
+        ord (int, float, inf, -inf, 'fro', 'nuc', optional): order of norm. Default: `2`
+        dim (int, Tuple[int], optional): dimensions over which to compute
+            the norm. See above for the behavior when :attr:`dim`\\ `= None`.
+            Default: `None`
+        keepdim (bool, optional): If set to `True`, the reduced dimensions are retained
+            in the result as dimensions with size one. Default: `False`
+
+    Returns:
+        A real-valued tensor.
+
+    Examples::
+
+        >>> import oneflow as flow
+        >>> from oneflow import linalg as LA
+        >>> import numpy as np
+        >>> a = flow.tensor(np.arange(9, dtype=np.float32) - 4)
+        >>> a
+        tensor([-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.], dtype=oneflow.float32)
+        >>> b = a.reshape((3, 3))
+        >>> b
+        tensor([[-4., -3., -2.],
+                [-1.,  0.,  1.],
+                [ 2.,  3.,  4.]], dtype=oneflow.float32)
+        >>> LA.vector_norm(a, ord=3.5)
+        tensor([5.4345], dtype=oneflow.float32)
+        >>> LA.vector_norm(b, ord=3.5)
+        tensor([5.4345], dtype=oneflow.float32)
+    """
+    return Vector_Norm(ord, dim, keepdim)(input)
+
+
+def matrix_norm_tensor_op(input, ord="fro", dim=(-2, -1), keepdim=False):
+    """
+    linalg.matrix_norm(input, ord='fro', dim=(-2, -1), keepdim=False, *, dtype=None, out=None) -> Tensor
+
+    Computes a matrix norm.
+
+    Support input of float, double, cfloat and cdouble dtypes.
+    Also supports batches of matrices: the norm will be computed over the
+    dimensions specified by the 2-tuple :attr:`dim` and the other dimensions will
+    be treated as batch dimensions. The output will have the same batch dimensions.
+
+    :attr:`ord` defines the matrix norm that is computed. The following norms are supported:
+
+    ======================   ========================================================
+    :attr:`ord`              matrix norm
+    ======================   ========================================================
+    `'fro'` (default)        Frobenius norm
+    `'nuc'`                  -- not supported yet --
+    `inf`                    `max(sum(abs(x), dim=1))`
+    `-inf`                   `min(sum(abs(x), dim=1))`
+    `1`                      `max(sum(abs(x), dim=0))`
+    `-1`                     `min(sum(abs(x), dim=0))`
+    `2`                      -- not supported yet --
+    `-2`                     -- not supported yet --
+    ======================   ========================================================
+
+    where `inf` refers to `float('inf')`, NumPy's `inf` object, or any equivalent object.
+
+    Args:
+        input (Tensor): tensor with two or more dimensions. By default its
+            shape is interpreted as `(*, m, n)` where `*` is zero or more
+            batch dimensions, but this behavior can be controlled using :attr:`dim`.
+        ord (int, inf, -inf, 'fro', 'nuc', optional): order of norm. Default: `'fro'`
+        dim (Tuple[int, int], optional): dimensions over which to compute the norm. Default: `(-2, -1)`
+        keepdim (bool, optional): If set to `True`, the reduced dimensions are retained
+            in the result as dimensions with size one. Default: `False`
+
+
+    Returns:
+        A real-valued tensor.
+
+    Examples::
+
+        >>> import oneflow as flow
+        >>> from oneflow import linalg as LA
+        >>> import numpy as np
+        >>> a = flow.tensor(np.arange(9, dtype=np.float32)).reshape((3,3))
+        >>> a
+        tensor([[0., 1., 2.],
+                [3., 4., 5.],
+                [6., 7., 8.]], dtype=oneflow.float32)
+        >>> LA.matrix_norm(a)
+        tensor([14.2829], dtype=oneflow.float32)
+        >>> LA.matrix_norm(a, ord=-1)
+        tensor([9.], dtype=oneflow.float32)
+        >>> b = a.expand(2, -1, -1)
+        >>> b
+        tensor([[[0., 1., 2.],
+                 [3., 4., 5.],
+                 [6., 7., 8.]],
+        <BLANKLINE>
+                [[0., 1., 2.],
+                 [3., 4., 5.],
+                 [6., 7., 8.]]], dtype=oneflow.float32)
+        >>> LA.matrix_norm(b)
+        tensor([14.2829, 14.2829], dtype=oneflow.float32)
+        >>> LA.matrix_norm(b, dim=(0, 2))
+        tensor([ 3.1623, 10.    , 17.2627], dtype=oneflow.float32)
+    """
+    return Matrix_Norm(ord, dim, keepdim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/normalization.py b/python/oneflow/nn/modules/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..df6814f6d05034691b52bada073ccbf20ce3bc57
--- /dev/null
+++ b/python/oneflow/nn/modules/normalization.py
@@ -0,0 +1,324 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Tuple, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+from oneflow.nn import init
+from oneflow.nn.module import Module
+
+_shape_t = Union[int, Tuple[int], flow._oneflow_internal.Size]
+
+
+class GroupNorm(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.GroupNorm.html
+
+    Applies Group Normalization over a mini-batch of inputs as described in
+    the paper `Group Normalization <https://arxiv.org/abs/1803.08494>`__
+
+    .. math::
+
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The input channels are separated into :attr:`num_groups` groups, each containing
+    ``num_channels / num_groups`` channels. The mean and standard-deviation are calculated
+    separately over the each group. :math:`\\gamma` and :math:`\\beta` are learnable
+    per-channel affine transform parameter vectors of size :attr:`num_channels` if
+    :attr:`affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator, equivalent to
+    `torch.var(input, unbiased=False)`.
+
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+
+    Args:
+        num_groups (int): number of groups to separate the channels into
+        num_channels (int): number of channels expected in input
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        affine: a boolean value that when set to ``True``, this module
+            has learnable per-channel affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+
+    Shape:
+        - Input: :math:`(N, C, *)` where :math:`C=\\text{num_channels}`
+        - Output: :math:`(N, C, *)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.random.randn(20, 6, 10, 10))
+        >>> # Separate 6 channels into 3 groups
+        >>> m = flow.nn.GroupNorm(3, 6)
+        >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm)
+        >>> m = flow.nn.GroupNorm(6, 6)
+        >>> # Put all 6 channels into a single group (equivalent with LayerNorm)
+        >>> m = flow.nn.GroupNorm(1, 6)
+        >>> # Activating the module
+        >>> output = m(input)
+    
+"""
+
+    def __init__(
+        self,
+        num_groups: int,
+        num_channels: int,
+        eps: float = 1e-05,
+        affine: bool = True,
+    ) -> None:
+        super().__init__()
+        assert num_groups > 0, "The num_groups must larger than zero"
+        assert num_channels > 0, "The num_channels must larger than zero"
+        self.num_groups = num_groups
+        self.num_channels = num_channels
+        self.eps = eps
+        self.affine = affine
+        if self.affine:
+            self.weight = flow.nn.Parameter(flow.Tensor(1, num_channels, 1))
+            self.bias = flow.nn.Parameter(flow.Tensor(1, num_channels, 1))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        if self.affine:
+            flow.nn.init.ones_(self.weight)
+            flow.nn.init.zeros_(self.bias)
+
+    def forward(self, input: Tensor) -> Tensor:
+        assert (
+            len(input.shape) >= 3
+        ), "The dimensions of input tensor must larger than 2"
+        assert (
+            input.shape[1] == self.num_channels
+        ), "The channels of input tensor must equal num_channels"
+        origin_shape = input.shape
+        reshape_to_1d = flow.reshape(
+            input, shape=[origin_shape[0], self.num_groups, -1]
+        )
+        mean = flow.mean(reshape_to_1d, dim=2, keepdim=True)
+        variance = flow.var(reshape_to_1d, dim=2, keepdim=True)
+        normalized = (reshape_to_1d - mean) / flow.sqrt(variance + self.eps)
+        normalized = flow.reshape(
+            normalized, shape=[origin_shape[0], self.num_channels, -1]
+        )
+        if self.weight:
+            normalized = normalized * self.weight
+        if self.bias:
+            normalized = normalized + self.bias
+        res = flow.reshape(normalized, shape=tuple(input.shape))
+        return res
+
+    def extra_repr(self) -> str:
+        return "{num_groups}, {num_channels}, eps={eps}, affine={affine}".format(
+            **self.__dict__
+        )
+
+
+class LayerNorm(Module):
+    """Applies Layer Normalization over a mini-batch of inputs as described in
+    the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__
+
+    .. math::
+        y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta
+
+    The mean and standard-deviation are calculated separately over the last
+    certain number dimensions which have to be of the shape specified by
+    :attr:`normalized_shape`.
+    :math:`\\gamma` and :math:`\\beta` are learnable affine transform parameters of
+    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.
+    The standard-deviation is calculated via the biased estimator.
+
+    .. note::
+        Unlike Batch Normalization and Instance Normalization, which applies
+        scalar scale and bias for each entire channel/plane with the
+        :attr:`affine` option, Layer Normalization applies per-element scale and
+        bias with :attr:`elementwise_affine`.
+
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+
+    Args:
+        normalized_shape (int or list or oneflow.Size): input shape from an expected input of size
+
+            .. math::
+                [* \\times \\text{normalized_shape}[0] \\times \\text{normalized_shape}[1] \\times \\ldots \\times \\text{normalized_shape}[-1]]
+
+            If a single integer is used, it is treated as a singleton list, and this module will
+
+            normalize over the last dimension which is expected to be of that specific size.
+
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        elementwise_affine: a boolean value that when set to ``True``, this module
+            has learnable per-element affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+
+    Shape:
+        - Input: :math:`(N, *)`
+        - Output: :math:`(N, *)` (same shape as input)
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input_arr = np.array(
+        ...     [
+        ...         [
+        ...             [[-0.16046895, -1.03667831], [-0.34974465, 0.26505867]],
+        ...             [[-1.24111986, -0.53806001], [1.72426331, 0.43572459]],
+        ...         ],
+        ...         [
+        ...             [[-0.77390957, -0.42610624], [0.16398858, -1.35760343]],
+        ...             [[1.07541728, 0.11008703], [0.26361224, -0.48663723]],
+        ...         ],
+        ...     ],
+        ...     dtype=np.float32,
+        ... )
+
+        >>> x = flow.Tensor(input_arr)
+        >>> m = flow.nn.LayerNorm(2)
+        >>> y = m(x).numpy()
+        >>> y
+        array([[[[ 0.99997395, -0.99997395],
+                 [-0.999947  ,  0.999947  ]],
+        <BLANKLINE>
+                [[-0.9999596 ,  0.9999594 ],
+                 [ 0.999988  , -0.999988  ]]],
+        <BLANKLINE>
+        <BLANKLINE>
+               [[[-0.9998343 ,  0.9998341 ],
+                 [ 0.9999914 , -0.9999914 ]],
+        <BLANKLINE>
+                [[ 0.99997866, -0.99997866],
+                 [ 0.9999646 , -0.9999646 ]]]], dtype=float32)
+
+    """
+
+    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
+    normalized_shape: Tuple[int, ...]
+    eps: float
+    elementwise_affine: bool
+
+    def __init__(
+        self,
+        normalized_shape: _shape_t,
+        eps: float = 1e-05,
+        elementwise_affine: bool = True,
+    ) -> None:
+        super(LayerNorm, self).__init__()
+        if isinstance(normalized_shape, int):
+            normalized_shape = (normalized_shape,)
+        self.normalized_shape = tuple(normalized_shape)
+        self.epsilon = eps
+        self.elementwise_affine = elementwise_affine
+        if self.elementwise_affine:
+            self.weight = flow.nn.Parameter(flow.Tensor(*self.normalized_shape))
+            self.bias = flow.nn.Parameter(flow.Tensor(*self.normalized_shape))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+        self.begin_norm_axis = 1
+        self.begin_params_axis = 1
+
+    def reset_parameters(self) -> None:
+        if self.elementwise_affine:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
+
+    def forward(self, x):
+        assert len(x.shape) > len(
+            self.normalized_shape
+        ), "Input tensor dim must greater than normalized dim!"
+        self.begin_norm_axis = len(x.shape) - len(self.normalized_shape)
+        self.begin_params_axis = len(x.shape) - len(self.normalized_shape)
+        if x.device == flow.device("cpu"):
+            reduce_axis = []
+            for dim in range(len(x.shape)):
+                if dim >= self.begin_norm_axis:
+                    reduce_axis.append(dim)
+            mean = x.mean(dim=reduce_axis, keepdim=True)
+            variance = x.var(dim=reduce_axis, keepdim=True)
+            axis = self.begin_norm_axis
+            params_shape = x.shape[self.begin_params_axis :]
+            weight = self.weight
+            bias = self.bias
+            if len(mean.shape) == 1:
+                nd_params_shape = [1] * len(x.shape)
+                nd_params_shape[self.begin_norm_axis] = params_shape[0]
+                mean = mean.reshape(shape=nd_params_shape)
+                variance = variance.reshape(shape=nd_params_shape)
+                if self.weight and params_shape[0] == self.weight.nelement():
+                    weight = self.weight.reshape(shape=nd_params_shape)
+                if self.bias and params_shape[0] == self.bias.nelement():
+                    bias = self.bias.reshape(shape=nd_params_shape)
+            elif len(mean.shape) == len(x.shape):
+                pass
+            else:
+                raise ValueError(
+                    "shape of mean and variance should be 1D or has number of axes and x's"
+                )
+            variance += self.epsilon
+            normalized = (x - mean) * variance.rsqrt()
+            if self.weight:
+                normalized = normalized * weight
+            if self.bias:
+                normalized = normalized + bias
+            affined = normalized
+            nd_params_shape = [1] * (len(x.shape) - len(params_shape)) + list(
+                params_shape
+            )
+            if self.elementwise_affine:
+                affined = affined * self.weight
+                affined = affined + self.bias
+            return affined
+        else:
+            if self.elementwise_affine:
+                res = flow.F.layer_norm_affine(
+                    x,
+                    self.weight,
+                    self.bias,
+                    begin_norm_axis=self.begin_norm_axis,
+                    begin_params_axis=self.begin_params_axis,
+                    epsilon=self.epsilon,
+                )
+            else:
+                res = flow.F.layer_norm(
+                    x,
+                    begin_norm_axis=self.begin_norm_axis,
+                    begin_params_axis=self.begin_params_axis,
+                    epsilon=self.epsilon,
+                )
+            return res
+
+    def extra_repr(self) -> str:
+        return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
+            **self.__dict__
+        )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/padding.py b/python/oneflow/nn/modules/padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..5283c5c32b5ac40ad13fc42e0497ba06820af246
--- /dev/null
+++ b/python/oneflow/nn/modules/padding.py
@@ -0,0 +1,413 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+class ReplicationPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ReplicationPad2d.html?highlight=replicationpad2d#torch.nn.ReplicationPad2d
+
+    Pads the input tensor using the replication of the input boundary.
+
+    Args:
+        padding (Union[int, tuple, list]):  the size of the padding. If is `int`, uses the same padding in all boundaries. If a 4-`tuple`, uses (:math:`\\mathrm{padding_{left}}`, :math:`\\mathrm{padding_{right}}`, :math:`\\mathrm{padding_{top}}`, :math:`\\mathrm{padding_{bottom}}`)
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+            :math:`H_{out} = H_{in} + \\mathrm{padding_{top}} + \\mathrm{padding_{bottom}}`
+
+            :math:`W_{out} = W_{in} + \\mathrm{padding_{left}} + \\mathrm{padding_{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> replicationpad_layer_0 = flow.nn.ReplicationPad2d((2, 2, 1, 1))
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32))
+        >>> input_int = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.int32))
+        >>> output = replicationpad_layer_0(input)
+        >>> output.shape
+        flow.Size([1, 2, 5, 7])
+        >>> output
+        tensor([[[[ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 3.,  3.,  3.,  4.,  5.,  5.,  5.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.]],
+        <BLANKLINE>
+                 [[ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [12., 12., 12., 13., 14., 14., 14.],
+                  [15., 15., 15., 16., 17., 17., 17.],
+                  [15., 15., 15., 16., 17., 17., 17.]]]], dtype=oneflow.float32)
+        >>> output_int = replicationpad_layer_0(input_int)
+        >>> output_int
+        tensor([[[[ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 0.,  0.,  0.,  1.,  2.,  2.,  2.],
+                  [ 3.,  3.,  3.,  4.,  5.,  5.,  5.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.],
+                  [ 6.,  6.,  6.,  7.,  8.,  8.,  8.]],
+        <BLANKLINE>
+                 [[ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [ 9.,  9.,  9., 10., 11., 11., 11.],
+                  [12., 12., 12., 13., 14., 14., 14.],
+                  [15., 15., 15., 16., 17., 17., 17.],
+                  [15., 15., 15., 16., 17., 17., 17.]]]], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, padding: Union[int, tuple, list]):
+        super().__init__()
+        if isinstance(padding, (tuple, list)):
+            assert len(padding) == 4, ValueError("Length of padding must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be int or list or tuple!")
+        self.padding = boundary
+
+    def forward(self, x):
+        (_, _, h, w) = x.shape
+        if (
+            self.padding[2] < h
+            and self.padding[3] < h
+            and (self.padding[0] < w)
+            and (self.padding[1] < w)
+        ):
+            return flow.F.pad(x, pad=self.padding, mode="replicate")
+        else:
+            raise AssertionError(
+                "Padding size should be less than the corresponding input dimension. Please check."
+            )
+
+    def extra_repr(self) -> str:
+        return "{}".format(self.padding)
+
+
+class ReflectionPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ReflectionPad2d.html
+
+
+    This operator pads the input tensor using the reflection of the input boundary.
+
+    Args:
+        padding (Union[int,tuple]): The size or bundary of padding, if is `int` uses the same padding in all dimension; if 4-dims `tuple`, uses :math:`(\\text{padding}_{\\text{left}}, \\text{padding}_{\\text{right}}, \\text{padding}_{\\text{top}}, \\text{padding}_{\\text{bottom}} )`
+
+    Returns:
+        Tensor: Returns a new tensor which is result of the reflection padding of the input tensor.
+
+    Shape:
+        - Input: :math:`(N, C, H_{\\text{in}}, W_{\\text{in}})`
+        - Output: :math:`(N, C, H_{\\text{out}}, W_{\\text{out}})` where
+
+          :math:`H_{\\text{out}} = H_{\\text{in}} + \\text{padding}_{\\text{top}} + \\text{padding}_{\\text{bottom}}`
+
+          :math:`W_{\\text{out}} = W_{\\text{in}} + \\text{padding}_{\\text{left}} + \\text{padding}_{\\text{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)), dtype=flow.float32)
+        >>> m = flow.nn.ReflectionPad2d((2, 2, 1, 1))
+        >>> out = m(input)
+        >>> out
+        tensor([[[[ 5.,  4.,  3.,  4.,  5.,  4.,  3.],
+                  [ 2.,  1.,  0.,  1.,  2.,  1.,  0.],
+                  [ 5.,  4.,  3.,  4.,  5.,  4.,  3.],
+                  [ 8.,  7.,  6.,  7.,  8.,  7.,  6.],
+                  [ 5.,  4.,  3.,  4.,  5.,  4.,  3.]],
+        <BLANKLINE>
+                 [[14., 13., 12., 13., 14., 13., 12.],
+                  [11., 10.,  9., 10., 11., 10.,  9.],
+                  [14., 13., 12., 13., 14., 13., 12.],
+                  [17., 16., 15., 16., 17., 16., 15.],
+                  [14., 13., 12., 13., 14., 13., 12.]]]], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, padding: Union[int, tuple]) -> None:
+        super().__init__()
+        if isinstance(padding, tuple):
+            assert len(padding) == 4, ValueError("Padding length must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be in or list or tuple!")
+        self.padding = boundary
+
+    def forward(self, x):
+        (H, W) = (x.shape[2], x.shape[3])
+        if (
+            self.padding[2] < H
+            and self.padding[3] < H
+            and (self.padding[0] < W)
+            and (self.padding[1] < W)
+        ):
+            return flow.F.pad(x, pad=self.padding, mode="reflect")
+        else:
+            raise ValueError(
+                "padding size should be less than the corresponding input dimension!"
+            )
+
+    def extra_repr(self) -> str:
+        return "{}".format(self.padding)
+
+
+class ConstantPad1d(Module):
+    """Pads the input tensor boundaries with a constant value.
+    The interface is consistent with PyTorch, and referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ConstantPad1d.html?highlight=constantpad1d#torch.nn.ConstantPad1d
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, list, tuple): the size of the padding. If is `int`, uses the same
+            padding in both boundaries. If a 2-`tuple`, uses
+            (:math:`\\text{padding_left}`, :math:`\\text{padding_right}`)
+
+        value (int, float): The constant value used for padding. Defaults to 0.
+
+    Shape:
+        - Input: :math:`(N, C, W_{in})`
+        - Output: :math:`(N, C, W_{out})` where
+
+          :math:`W_{out} = W_{in} + \\text{padding\\_left} + \\text{padding\\_right}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> input = flow.tensor(np.arange(8).reshape(2,2,2).astype(np.float32))
+        >>> m = flow.nn.ConstantPad1d(padding=[1, 2], value=9.9999)
+        >>> output = m(input)
+        >>> output
+        tensor([[[9.9999, 0.    , 1.    , 9.9999, 9.9999],
+                 [9.9999, 2.    , 3.    , 9.9999, 9.9999]],
+        <BLANKLINE>
+                [[9.9999, 4.    , 5.    , 9.9999, 9.9999],
+                 [9.9999, 6.    , 7.    , 9.9999, 9.9999]]], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, padding: Union[int, tuple, list], value: Union[int, float] = 0):
+        super().__init__()
+        if isinstance(padding, (tuple, list)):
+            assert len(padding) == 2, ValueError("Length of padding must be 4")
+            boundary = [padding[0], padding[1]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding]
+        else:
+            raise ValueError("padding must be int or list or tuple!")
+        self.padding = boundary
+        self.value = value
+
+    def forward(self, x):
+        if x.dtype in (flow.float32, flow.float16, flow.float64):
+            self.value = float(self.value)
+        else:
+            self.value = int(self.value)
+        return flow.F.pad(x, pad=self.padding, mode="constant", value=self.value)
+
+
+class ConstantPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ConstantPad2d.html?highlight=constantpad2d#torch.nn.ConstantPad2d
+
+    This operator pads the input with constant value that user specifies.
+    User can set the amount of padding by setting the parameter `paddings`.
+
+    Args:
+        padding (int, tuple, list):  the size of the padding.
+            If is `int`, uses the same padding in all boundaries.
+            If a 4-`tuple`, uses
+            (:math:`\\mathrm{padding_{left}}`, :math:`\\mathrm{padding_{right}}`, :math:`\\mathrm{padding_{top}}`, :math:`\\mathrm{padding_{bottom}}`)
+
+        value (int, float): The constant value used for padding. Defaults to 0.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+          :math:`H_{out} = H_{in} + \\mathrm{padding_{top}} + \\mathrm{padding_{bottom}}`
+          :math:`W_{out} = W_{in} + \\mathrm{padding_{left}} + \\mathrm{padding_{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> constantpad_layer_0 = flow.nn.ConstantPad2d((2, 2, 1, 1), 1)
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32))
+        >>> input_int = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.int32))
+        >>> output = constantpad_layer_0(input)
+        >>> output.shape
+        flow.Size([1, 2, 5, 7])
+        >>> output
+        tensor([[[[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  0.,  1.,  2.,  1.,  1.],
+                  [ 1.,  1.,  3.,  4.,  5.,  1.,  1.],
+                  [ 1.,  1.,  6.,  7.,  8.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]],
+        <BLANKLINE>
+                 [[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  9., 10., 11.,  1.,  1.],
+                  [ 1.,  1., 12., 13., 14.,  1.,  1.],
+                  [ 1.,  1., 15., 16., 17.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]]]], dtype=oneflow.float32)
+        >>> output_int = constantpad_layer_0(input_int)
+        >>> output_int
+        tensor([[[[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  0.,  1.,  2.,  1.,  1.],
+                  [ 1.,  1.,  3.,  4.,  5.,  1.,  1.],
+                  [ 1.,  1.,  6.,  7.,  8.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]],
+        <BLANKLINE>
+                 [[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
+                  [ 1.,  1.,  9., 10., 11.,  1.,  1.],
+                  [ 1.,  1., 12., 13., 14.,  1.,  1.],
+                  [ 1.,  1., 15., 16., 17.,  1.,  1.],
+                  [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]]]], dtype=oneflow.float32)
+
+    """
+
+    def __init__(self, padding: Union[int, tuple, list], value: Union[int, float] = 0):
+        super().__init__()
+        if isinstance(padding, (tuple, list)):
+            assert len(padding) == 4, ValueError("Length of padding must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be int or list or tuple!")
+        self.padding = boundary
+        self.value = value
+
+    def forward(self, x):
+        if x.dtype in (flow.float32, flow.float16, flow.float64):
+            self.value = float(self.value)
+        else:
+            self.value = int(self.value)
+        return flow.F.pad(x, pad=self.padding, mode="constant", value=self.value)
+
+
+class ConstantPad3d(Module):
+    """Pads the input tensor boundaries with a constant value.
+    The interface is consistent with PyTorch, and referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ConstantPad3d.html?highlight=constantpad3d#torch.nn.ConstantPad3d
+
+    For `N`-dimensional padding, use :func:`flow.nn.functional.pad()`.
+
+    Args:
+        padding (int, list, tuple): the size of the padding. If is `int`, uses the same
+            padding in all boundaries. If a 6-`tuple`, uses
+            (:math:`\\text{padding_left}`, :math:`\\text{padding_right}`,
+            :math:`\\text{padding_top}`, :math:`\\text{padding_bottom}`,
+            :math:`\\text{padding_front}`, :math:`\\text{padding_back}`)
+
+        value (int, float): The constant value used for padding. Defaults to 0.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where
+
+          :math:`D_{out} = D_{in} + \\text{padding_front} + \\text{padding_back}`
+
+          :math:`H_{out} = H_{in} + \\text{padding_top} + \\text{padding_bottom}`
+
+          :math:`W_{out} = W_{in} + \\text{padding_left} + \\text{padding_right}`
+
+    Examples::
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+
+        >>> input = flow.tensor(np.arange(8).reshape(1,1,2,2,2).astype(np.int32))
+        >>> m = flow.nn.ConstantPad3d(padding=1, value=9)
+        >>> output = m(input)
+        >>> output
+        tensor([[[[[9, 9, 9, 9],
+                   [9, 9, 9, 9],
+                   [9, 9, 9, 9],
+                   [9, 9, 9, 9]],
+        <BLANKLINE>
+                  [[9, 9, 9, 9],
+                   [9, 0, 1, 9],
+                   [9, 2, 3, 9],
+                   [9, 9, 9, 9]],
+        <BLANKLINE>
+                  [[9, 9, 9, 9],
+                   [9, 4, 5, 9],
+                   [9, 6, 7, 9],
+                   [9, 9, 9, 9]],
+        <BLANKLINE>
+                  [[9, 9, 9, 9],
+                   [9, 9, 9, 9],
+                   [9, 9, 9, 9],
+                   [9, 9, 9, 9]]]]], dtype=oneflow.int32)
+    """
+
+    def __init__(self, padding: Union[int, tuple, list], value: Union[int, float] = 0):
+        super().__init__()
+        if isinstance(padding, (tuple, list)):
+            assert len(padding) == 6, ValueError("Length of padding must be 6")
+            boundary = [
+                padding[0],
+                padding[1],
+                padding[2],
+                padding[3],
+                padding[4],
+                padding[5],
+            ]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be int or list or tuple!")
+        self.padding = boundary
+        self.value = value
+
+    def forward(self, x):
+        if x.dtype in (flow.float32, flow.float16, flow.float64):
+            self.value = float(self.value)
+        else:
+            self.value = int(self.value)
+        return flow.F.pad(x, pad=self.padding, mode="constant", value=self.value)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/permute.py b/python/oneflow/nn/modules/permute.py
new file mode 100644
index 0000000000000000000000000000000000000000..a599af239577192104de5670674c68d0845b6340
--- /dev/null
+++ b/python/oneflow/nn/modules/permute.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Permute(Module):
+    def __init__(self, *dims) -> None:
+        super().__init__()
+        self.perm = list(*dims)
+
+    def forward(self, x):
+        assert len(self.perm) == len(x.shape)
+        new_perm = []
+        for dim in self.perm:
+            if dim < 0:
+                dim += len(self.perm)
+            assert dim >= 0 and dim < len(
+                x.shape
+            ), "Invalid dim0 {}, len(shape): {}".format(dim, len(x.shape))
+            new_perm.append(dim)
+        return flow.F.transpose(x, perm=new_perm)
+
+
+@register_tensor_op("permute")
+def permute_op(tensor, *dims):
+    """Returns a view of the original tensor with its dimensions permuted.
+
+    Args:
+        *dims (int...): The desired ordering of dimensions
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> out = input.permute(1, 0, 2, 3).shape
+        >>> out
+        flow.Size([6, 2, 5, 3])
+
+    """
+    return Permute(dims)(tensor)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/pixelshuffle.py b/python/oneflow/nn/modules/pixelshuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..f63102975cb6addfd5e155614565072e3026ff32
--- /dev/null
+++ b/python/oneflow/nn/modules/pixelshuffle.py
@@ -0,0 +1,154 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.module import Module
+
+
+class PixelShufflev2(Module):
+    """
+    Part of the documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html#torch.nn.PixelShuffle
+
+    Rearranges elements in a tensor of shape :math:`(*, C \\times r_h \\times r_w, H, W)`
+    to a tensor of shape :math:`(*, C, H \\times r_h, W \\times r_w)`, where r_h and r_w are upscale factors.
+
+    This is useful for implementing efficient sub-pixel convolution
+    with a stride of :math:`1/r`.
+
+    See the paper:
+    `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_
+    by Shi et. al (2016) for more details.
+
+    Args:
+        upscale_factor (int, optional): factor to increase spatial resolution by, only use when factors of height and width spatial are the same.
+
+        h_upscale_factor (int, optional): factor to increase height spatial resolution by, only one of h_upscale_factor and upscale_factor can be used.
+        w_upscale_factor (int, optional): factor to increase width spatial resolution by, only one of w_upscale_factor and upscale_factor can be used.
+
+    Shape:
+        - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions
+        - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where
+
+    if use upscale_factor:
+
+    .. math::
+        C_{out} = C_{in} \\div \\text{h_upscale_factor}^2
+
+        H_{out} = H_{in} \\times \\text{upscale_factor}
+
+        W_{out} = W_{in} \\times \\text{upscale_factor}
+
+    if use h_upscale_factor and w_upscale_factor:
+
+    .. math::
+        C_{out} = C_{in} \\div \\text{h_upscale_factor} \\div \\text{w_upscale_factor}
+
+        H_{out} = H_{in} \\times \\text{h_upscale_factor}
+
+        W_{out} = W_{in} \\times \\text{w_upscale_factor}
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> m = flow.nn.PixelShuffle(upscale_factor=2)
+        >>> x = flow.Tensor(np.random.randn(3, 4, 5, 5))
+        >>> y = m(x)
+        >>> y.shape
+        flow.Size([3, 1, 10, 10])
+
+        >>> m = flow.nn.PixelShuffle(h_upscale_factor=3, w_upscale_factor=4)
+        >>> x = flow.Tensor(np.random.randn(1, 24, 2, 2))
+        >>> y = m(x)
+        >>> y.shape
+        flow.Size([1, 2, 6, 8])
+
+    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
+        https://arxiv.org/abs/1609.05158
+    """
+
+    def __init__(
+        self,
+        upscale_factor: Optional[int] = None,
+        h_upscale_factor: Optional[int] = None,
+        w_upscale_factor: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        if upscale_factor is None:
+            assert (
+                h_upscale_factor is not None and w_upscale_factor is not None
+            ), "h_upscale_factor and w_upscale_factor should be None if use upscale_factor"
+        else:
+            assert (
+                h_upscale_factor is None and w_upscale_factor is None
+            ), "upscale_factor should be None if use h_upscale_factor and w_upscale_factor"
+            h_upscale_factor = upscale_factor
+            w_upscale_factor = upscale_factor
+        assert (
+            h_upscale_factor > 0 and w_upscale_factor > 0
+        ), "The scale factor of height and width must larger than zero"
+        self.h_upscale_factor = h_upscale_factor
+        self.w_upscale_factor = w_upscale_factor
+
+    def forward(self, input: Tensor) -> Tensor:
+        assert len(input.shape) == 4, "Only Accept 4D Tensor"
+        (_batch, _channel, _height, _width) = input.shape
+        assert (
+            _channel % (self.h_upscale_factor * self.w_upscale_factor) == 0
+        ), "The channels of input tensor must be divisible by (upscale_factor * upscale_factor) or (h_upscale_factor * w_upscale_factor)"
+        _new_c = int(_channel / (self.h_upscale_factor * self.w_upscale_factor))
+        out = input.reshape(
+            [
+                _batch,
+                _new_c,
+                self.h_upscale_factor * self.w_upscale_factor,
+                _height,
+                _width,
+            ]
+        )
+        out = out.reshape(
+            [
+                _batch,
+                _new_c,
+                self.h_upscale_factor,
+                self.w_upscale_factor,
+                _height,
+                _width,
+            ]
+        )
+        out = out.permute(0, 1, 4, 2, 5, 3)
+        out = out.reshape(
+            [
+                _batch,
+                _new_c,
+                _height * self.h_upscale_factor,
+                _width * self.w_upscale_factor,
+            ]
+        )
+        return out
+
+    def extra_repr(self) -> str:
+        return f"w_upscale_factor={self.w_upscale_factor}, h_upscale_factor={self.h_upscale_factor}"
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/pooling.py b/python/oneflow/nn/modules/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb6b843db78363b7772ed72c13716e51551aec2d
--- /dev/null
+++ b/python/oneflow/nn/modules/pooling.py
@@ -0,0 +1,572 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow as flow
+from oneflow.nn.common_types import _size_1_t, _size_2_t, _size_3_t
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _getint, _pair, _single, _triple
+from oneflow.ops.nn_ops import _GetSequence, calc_pool_padding, get_dhw_offset
+
+
+class AvgPool1d(Module):
+    """Applies a 1D average pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and `kernel_size` :math:`k`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, C_j, l)  = \\frac{1}{k} \\sum_{m=0}^{k-1}
+                               input(N_i, C_j, stride[0] \\times h + m, stride*l + m)
+
+    If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points.
+    The parameters kernel_size, stride, padding can each be an int or a one-element tuple.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding or the
+        input. Sliding windows that would start in the right padded region are ignored.
+    
+    Args:
+        kernel_size: the size of the window.
+        strides: the stride of the window. Default value is kernel_size.
+        padding: implicit zero padding to be added on both sides.
+        ceil_mode: when True, will use ceil instead of floor to compute the output shape.
+        count_include_pad: when True, will include the zero-padding in the averaging calculation.
+
+
+    # TODO: fix cuDNN bugs in pooling_1d
+    
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_1_t,
+        stride: Optional[_size_1_t] = None,
+        padding: _size_1_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: Optional[bool] = None,
+    ):
+        raise NotImplementedError
+
+
+class AvgPool2d(Module):
+    """Performs the 2d-average pooling on the input.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and `kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, C_j, h, w)  = \\frac{1}{kH * kW} \\sum_{m=0}^{kH-1} \\sum_{n=0}^{kW-1}
+                               input(N_i, C_j, stride[0] \\times h + m, stride[1] \\times w + n)
+
+    Args:
+        kernel_size (Union[int, Tuple[int, int]]):  An int or list of ints that has length 1, 2. The size of the window for each dimension of the input Tensor.
+        strides (Union[int, Tuple[int, int]]): An int or list of ints that has length 1, 2. The stride of the sliding window for each dimension of the input Tensor.
+        padding (Tuple[int, int]): An int or list of ints that has length 1, 2. Implicit zero padding to be added on both sides.
+        ceil_mode (bool, default to False): When True, will use ceil instead of floor to compute the output shape.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+
+
+        of_avgpool2d = flow.nn.AvgPool2d(
+            kernel_size=(3, 2),
+            padding=0,
+            stride=(2, 1),
+        )
+        x = flow.Tensor(shape=(1, 1, 10, 10))
+        of_y = of_avgpool2d(x)   
+        
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        stride: Optional[_size_2_t] = None,
+        padding: _size_2_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: Optional[bool] = None,
+        divisor_override: Optional[int] = None,
+    ):
+        super().__init__()
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride) if stride is not None else _pair(kernel_size)
+        assert isinstance(padding, int) or isinstance(
+            padding, tuple
+        ), "padding can only int int or tuple of 2 ints."
+        padding = _pair(padding)
+        self.padding = padding
+        padding = [0, 0, *padding]
+        assert count_include_pad is None, "count_include_pad not supported yet"
+        assert divisor_override is None, "divisor_override not supported yet"
+        self._channel_pos = "channels_first"
+        (self._padding_type, _pads_list) = calc_pool_padding(
+            padding, get_dhw_offset(self._channel_pos), 2
+        )
+        self._padding_before = [pad[0] for pad in _pads_list]
+        self._padding_after = [pad[1] for pad in _pads_list]
+        self.ceil_mode = ceil_mode
+
+    def forward(self, x):
+        return flow.F.avg_pool_2d(
+            x,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            padding=self._padding_type,
+            padding_before=self._padding_before,
+            padding_after=self._padding_after,
+            ceil_mode=self.ceil_mode,
+            data_format=self._channel_pos,
+        )
+
+    def extra_repr(self) -> str:
+        return "kernel_size={kernel_size}, stride={stride}, padding={padding}, ceil_mode={ceil_mode}".format(
+            **self.__dict__
+        )
+
+
+class AvgPool3d(Module):
+    """Applies a 3D average pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
+    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and `kernel_size` :math:`(kD, kH, kW)`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, C_j, d, h, w)  = \\frac{1}{kD * kH * kW } \\sum_{k=0}^{kD-1} \\sum_{m=0}^{kH-1} \\sum_{n=0}^{kW-1}
+                               input(N_i, C_j, stride[0] \\times d + k, stride[1] \\times h + m, stride[2] \\times w + n)
+    
+    If padding is non-zero, then the input is implicitly zero-padded on all three sides for padding number of points.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding or the
+        input. Sliding windows that would start in the right padded region are ignored.
+
+    Args:
+        kernel_size: the size of the window.
+        strides:  the stride of the window. Default value is kernel_size.
+        padding:  implicit zero padding to be added on all three sides.
+        ceil_mode:  when True, will use ceil instead of floor to compute the output shape.
+        count_include_pad: when True, will include the zero-padding in the averaging calculation.
+        divisor_override: if specified, it will be used as divisor, otherwise kernel_size will be used.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = \\left\\lfloor\\frac{D_{in} + 2 \\times \\text{padding}[0] - \\text{kernel_size}[0]}{\\text{stride}[0]} + 1\\right\\rfloor
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in} + 2 \\times \\text{padding}[1] - \\text{kernel_size}[1]}{\\text{stride}[1]} + 1\\right\\rfloor
+
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in} + 2 \\times \\text{padding}[2] - \\text{kernel_size}[2]}{\\text{stride}[2]} + 1\\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> m = flow.nn.AvgPool3d(kernel_size=(2,2,2),padding=(0,0,0),stride=(1,1,1))
+        >>> x = flow.Tensor(np.random.randn(9, 7, 11, 32, 20))
+        >>> y = m(x)
+        >>> y.shape
+        flow.Size([9, 7, 10, 31, 19])
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        stride: Optional[_size_3_t] = None,
+        padding: _size_3_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: Optional[bool] = None,
+        divisor_override: Optional[int] = None,
+    ):
+        super().__init__()
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride) if stride is not None else _triple(kernel_size)
+        assert padding == (0, 0, 0), "padding>0 not supported yet"
+        assert isinstance(padding, int) or isinstance(
+            padding, tuple
+        ), "padding can only int int or tuple of 3 ints."
+        padding = _triple(padding)
+        padding = [0, 0, *padding]
+        assert count_include_pad is None, "count_include_pad not supported yet"
+        assert divisor_override is None, "divisor_override not supported yet"
+        _channel_pos = "channels_first"
+        (_padding_type, _pads_list) = calc_pool_padding(
+            padding, get_dhw_offset(_channel_pos), 3
+        )
+        _padding_before = [pad[0] for pad in _pads_list]
+        _padding_after = [pad[1] for pad in _pads_list]
+        self._op = (
+            flow.builtin_op("avg_pool_3d")
+            .Attr("data_format", _channel_pos)
+            .Attr("pool_size", kernel_size)
+            .Attr("strides", stride)
+            .Attr("ceil_mode", ceil_mode)
+            .Attr("padding", _padding_type)
+            .Attr("padding_before", _padding_before)
+            .Attr("padding_after", _padding_after)
+            .Input("x")
+            .Output("y")
+            .Build()
+        )
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+class MaxPool1d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.MaxPool1d.html#torch.nn.MaxPool1d
+
+    Applies a 1D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, L)`
+    and output :math:`(N, C, L_{out})` can be precisely described as:
+
+    .. math::
+        out(N_i, C_j, k) = \\max_{m=0, \\ldots, \\text{kernel\\_size} - 1}
+                input(N_i, C_j, stride \\times k + m)
+
+    If :attr:`padding` is non-zero, then the input is implicitly padded with minimum value on both sides
+    for :attr:`padding` number of points. :attr:`dilation` is the stride between the elements within the
+    sliding window. This `link`_ has a nice visualization of the pooling parameters.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    Args:
+        kernel_size: The size of the sliding window, must be > 0.
+        stride: The stride of the sliding window, must be > 0. Default value is :attr:`kernel_size`.
+        padding: Implicit negative infinity padding to be added on both sides, must be >= 0 and <= kernel_size / 2.
+        dilation: The stride between elements within a sliding window, must be > 0.
+        return_indices: If ``True``, will return the argmax along with the max values.
+                        Useful for :class:`torch.nn.MaxUnpool1d` later
+        ceil_mode: If ``True``, will use `ceil` instead of `floor` to compute the output shape. This
+                   ensures that every element in the input tensor is covered by a sliding window.
+
+    Shape:
+        - Input: :math:`(N, C, L_{in})`
+        - Output: :math:`(N, C, L_{out})`, where
+
+          .. math::
+              L_{out} = \\left\\lfloor \\frac{L_{in} + 2 \\times \\text{padding} - \\text{dilation}
+                    \\times (\\text{kernel_size} - 1) - 1}{\\text{stride}} + 1\\right\\rfloor
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_1_t,
+        stride: Optional[_size_1_t] = None,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ):
+        super().__init__()
+        self.kernel_size = _single(kernel_size)
+        self.stride = _single(stride) if stride is not None else self.kernel_size
+        data_format = "NCL"
+        self.channel_pos = "channels_first" if data_format == "NCL" else "channels_last"
+        self.dilation = _single(dilation)
+        self.padding = _single(padding)
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+
+    def forward(self, x):
+        (y, indice) = flow.F.maxpool_1d(
+            x,
+            data_format=self.channel_pos,
+            padding=self.padding,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            dilation=self.dilation,
+            return_indices=True,
+            ceil_mode=self.ceil_mode,
+        )
+        if self.return_indices:
+            return (y, indice)
+        else:
+            return y
+
+    def extra_repr(self) -> str:
+        return "kernel_size={}, stride={}, padding={}".format(
+            self.kernel_size, self.stride, self.padding
+        )
+
+
+class MaxPool2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
+
+    Applies a 2D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \\begin{aligned}
+            out(N_i, C_j, h, w) ={} & \\max_{m=0, \\ldots, kH-1} \\max_{n=0, \\ldots, kW-1} \\\\
+                                    & \\text{input}(N_i, C_j, \\text{stride[0]} \\times h + m,
+                                                   \\text{stride[1]} \\times w + n)
+        \\end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly minimum value padded on both sides
+    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
+    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+        - a single ``int`` -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: implicit minimum value padding to be added on both sides
+        dilation: a parameter that controls the stride of elements in the window
+        return_indices: if ``True``, will return the max indices along with the outputs.
+                        Useful for :class:`torch.nn.MaxUnpool2d` later
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})`, where
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in} + 2 * \\text{padding[0]} - \\text{dilation[0]}
+                    \\times (\\text{kernel_size[0]} - 1) - 1}{\\text{stride[0]}} + 1\\right\\rfloor
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in} + 2 * \\text{padding[1]} - \\text{dilation[1]}
+                    \\times (\\text{kernel_size[1]} - 1) - 1}{\\text{stride[1]}} + 1\\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> kernel_size, stride, padding = (3, 4), (1, 1), (1, 2)
+        >>> m = flow.nn.MaxPool2d(kernel_size, stride, padding)
+        >>> np.random.seed(0)
+        >>> x = flow.Tensor(np.random.rand(1, 1, 5, 3))
+        >>> y = m(x)
+        >>> y #doctest: +ELLIPSIS
+        tensor([[[[0.7152, 0.7152, 0.7152, 0.7152],
+                  ...
+                  [0.9256, 0.9256, 0.9256, 0.9256]]]], dtype=oneflow.float32)
+
+        >>> kernel_size, stride, padding = (2, 4), (4, 5), (1, 2)
+        >>> m = flow.nn.MaxPool2d(kernel_size, stride, padding)
+        >>> x = flow.Tensor(np.random.randn(9, 7, 32, 20))
+        >>> y = m(x)
+        >>> y.shape
+        flow.Size([9, 7, 9, 5])
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        stride: Optional[_size_2_t] = None,
+        padding: _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ):
+        super().__init__()
+        self.kernel_size = _pair(kernel_size)
+        data_format = "NCHW"
+        self.channel_pos = (
+            "channels_first" if data_format == "NCHW" else "channels_last"
+        )
+        self.stride = _pair(stride) if stride is not None else _pair(kernel_size)
+        self.dilation = _GetSequence(dilation, 2, "dilation")
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+        self.padding = _pair(padding)
+
+    def forward(self, x):
+        (y, indice) = flow.F.maxpool_2d(
+            x,
+            data_format=self.channel_pos,
+            padding=self.padding,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            dilation=self.dilation,
+            return_indices=True,
+            ceil_mode=self.ceil_mode,
+        )
+        if self.return_indices:
+            return (y, indice)
+        else:
+            return y
+
+    def extra_repr(self) -> str:
+        return "kernel_size={}, stride={}, padding={}, dilation={}".format(
+            self.kernel_size, self.stride, self.padding, self.dilation
+        )
+
+
+class MaxPool3d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.MaxPool3d.html#torch.nn.MaxPool3d
+
+    Applies a 3D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
+    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \\begin{aligned}
+            \\text{out}(N_i, C_j, d, h, w) ={} & \\max_{k=0, \\ldots, kD-1} \\max_{m=0, \\ldots, kH-1} \\max_{n=0, \\ldots, kW-1} \\\\
+                                              & \\text{input}(N_i, C_j, \\text{stride[0]} \\times d + k,
+                                                             \\text{stride[1]} \\times h + m, \\text{stride[2]} \\times w + n)
+        \\end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly minimum value on both sides
+    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
+    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
+        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
+          the second `int` for the height dimension and the third `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: implicit minimum value padding to be added on all three sides
+        dilation: a parameter that controls the stride of elements in the window
+        return_indices: if ``True``, will return the max indices along with the outputs.
+                        Useful for :class:`torch.nn.MaxUnpool3d` later
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = \\left\\lfloor\\frac{D_{in} + 2 \\times \\text{padding}[0] - \\text{dilation}[0] \\times
+                (\\text{kernel_size}[0] - 1) - 1}{\\text{stride}[0]} + 1\\right\\rfloor
+
+          .. math::
+              H_{out} = \\left\\lfloor\\frac{H_{in} + 2 \\times \\text{padding}[1] - \\text{dilation}[1] \\times
+                (\\text{kernel_size}[1] - 1) - 1}{\\text{stride}[1]} + 1\\right\\rfloor
+
+          .. math::
+              W_{out} = \\left\\lfloor\\frac{W_{in} + 2 \\times \\text{padding}[2] - \\text{dilation}[2] \\times
+                (\\text{kernel_size}[2] - 1) - 1}{\\text{stride}[2]} + 1\\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> kernel_size, stride, padding = (3, 3, 4), (1, 1, 1), (1, 1, 2)
+        >>> m = flow.nn.MaxPool3d(kernel_size, stride, padding)
+        >>> np.random.seed(0)
+        >>> x = flow.Tensor(np.random.rand(1, 1, 3, 5, 3))
+        >>> y = m(x)
+        >>> y #doctest: +ELLIPSIS
+        tensor([[[[[0.87  , 0.9786, 0.9786, 0.9786],
+                   ...
+                   [0.9447, 0.9447, 0.9447, 0.6668]]]]], dtype=oneflow.float32)
+        >>> kernel_size, stride, padding = (4, 2, 4), (3, 4, 5), (2, 1, 2)
+        >>> m = flow.nn.MaxPool3d(kernel_size, stride, padding)
+        >>> x = flow.Tensor(np.random.randn(9, 7, 11, 32, 20))
+        >>> y = m(x)
+        >>> y.shape
+        flow.Size([9, 7, 4, 9, 5])
+
+    """
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        stride: Optional[_size_3_t] = None,
+        padding: _size_3_t = 0,
+        dilation: _size_3_t = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ):
+        super().__init__()
+        self.kernel_size = _triple(kernel_size)
+        self.stride = _triple(stride) if stride is not None else _triple(kernel_size)
+        data_format = "NCDHW"
+        self.channel_pos = (
+            "channels_last" if data_format == "NDHWC" else "channels_first"
+        )
+        self.dilation = _GetSequence(dilation, 3, "dilation")
+        self.padding = _triple(padding)
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+
+    def forward(self, x):
+        (y, indice) = flow.F.maxpool_3d(
+            x,
+            data_format=self.channel_pos,
+            padding=self.padding,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            dilation=self.dilation,
+            return_indices=True,
+            ceil_mode=self.ceil_mode,
+        )
+        if self.return_indices:
+            return (y, indice)
+        else:
+            return y
+
+    def extra_repr(self) -> str:
+        return "kernel_size={}, stride={}, padding={}, dilation={}".format(
+            self.kernel_size, self.stride, self.padding, self.dilation
+        )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/random_ops.py b/python/oneflow/nn/modules/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2623643f2c7d12231d9ad3a17e8b2dc94a124
--- /dev/null
+++ b/python/oneflow/nn/modules/random_ops.py
@@ -0,0 +1,64 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+def bernoulli(input, *, generator=None, out=None):
+    """This operator returns a Tensor with binaray random numbers (0 / 1) from a Bernoulli distribution.
+
+    Args:
+        input(Tensor) - the input tensor of probability values for the Bernoulli distribution
+        generator: (optional) 鈥� a pseudorandom number generator for sampling
+        out (Tensor, optional) 鈥� the output tensor.
+
+    Shape:
+        - Input: :math:`(*)`. Input can be of any shape
+        - Output: :math:`(*)`. Output is of the same shape as input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> arr = np.array(
+        ...    [
+        ...        [1.0, 1.0, 1.0],
+        ...        [1.0, 1.0, 1.0],
+        ...        [1.0, 1.0, 1.0],
+        ...    ]
+        ... )
+        >>> x = flow.Tensor(arr)
+        >>> y = flow.bernoulli(x)
+        >>> y
+        tensor([[1., 1., 1.],
+                [1., 1., 1.],
+                [1., 1., 1.]], dtype=oneflow.float32)
+
+
+    """
+    return flow.F.bernoulli(input, flow.float32, generator)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/reduce_ops.py b/python/oneflow/nn/modules/reduce_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..867ef0323d0d8068b92b1ab9b230d0af3047257b
--- /dev/null
+++ b/python/oneflow/nn/modules/reduce_ops.py
@@ -0,0 +1,184 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Sequence, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.nn.modules.utils import _check_axis
+
+
+def _build_reduce_op(op_type_name, keepdims):
+    return (
+        flow.builtin_op(op_type_name)
+        .Input("input_tensor")
+        .Output("output_tensor")
+        .Attr("keepdims", keepdims)
+        .Build()
+    )
+
+
+class Sum(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        return flow.F.reduce_sum(input, axis=axis_checked, keepdims=self.keepdims)
+
+
+@register_tensor_op("sum")
+def _sum(input, dim=None, keepdim=False):
+    """Computes the sum of row of elements in a tensor in the given axis, if the axis is None, sum of all elements will be caculated.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> input = flow.Tensor([[1, 2, 3], [4, 5, 6]])
+        >>> flow.sum(input)
+        tensor([21.], dtype=oneflow.float32)
+        >>> flow.sum(input, dim=0)
+        tensor([5., 7., 9.], dtype=oneflow.float32)
+        >>> flow.sum(input, dim=1)
+        tensor([ 6., 15.], dtype=oneflow.float32)
+
+    """
+    return Sum(dim, keepdim)(input)
+
+
+class Mean(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        return flow.F.reduce_mean(input, axis=axis_checked, keepdims=self.keepdims)
+
+
+@register_tensor_op("mean")
+def _mean(input, dim=None, keepdim=False):
+    """Computes the mean of row of elements in a tensor in the given axis, if the axis is None, mean of all elements will be caculated.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> input = flow.Tensor([[1, 2, 3], [4, 5, 6]])
+        >>> flow.mean(input)
+        tensor([3.5], dtype=oneflow.float32)
+        >>> flow.mean(input, dim=0)
+        tensor([2.5, 3.5, 4.5], dtype=oneflow.float32)
+        >>> flow.mean(input, dim=1)
+        tensor([2., 5.], dtype=oneflow.float32)
+
+    """
+    return Mean(dim, keepdim)(input)
+
+
+class Min(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+        self._op = _build_reduce_op("reduce_min", keepdims)
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        return self._op(input, axis=axis_checked)[0]
+
+
+@register_tensor_op("min")
+def _min(input, dim=None, keepdim=False):
+    """Computes the minimum value of all elements in the input tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> input = flow.Tensor([[4, 1, 5], [2, 6, 3]])
+        >>> flow.min(input)
+        tensor([1.], dtype=oneflow.float32)
+        >>> flow.min(input, dim=0)
+        tensor([2., 1., 3.], dtype=oneflow.float32)
+        >>> flow.min(input, dim=1)
+        tensor([1., 2.], dtype=oneflow.float32)
+
+    """
+    return Min(dim, keepdim)(input)
+
+
+class Max(Module):
+    def __init__(
+        self, axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False
+    ) -> None:
+        super().__init__()
+        self.axis = axis
+        self.keepdims = keepdims
+        self._op = _build_reduce_op("reduce_max", keepdims)
+
+    def forward(self, input):
+        axis_checked = _check_axis(self.axis, input.shape)
+        if len(axis_checked) == 0:
+            return input
+        return self._op(input, axis=axis_checked)[0]
+
+
+@register_tensor_op("max")
+def _max(input, dim=None, keepdim=False):
+    """Computes the maximum value of all elements in the input tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> input = flow.Tensor([[4, 1, 5], [2, 6, 3]])
+        >>> flow.max(input)
+        tensor([6.], dtype=oneflow.float32)
+        >>> flow.max(input, dim=0)
+        tensor([4., 6., 5.], dtype=oneflow.float32)
+        >>> flow.max(input, dim=1)
+        tensor([5., 6.], dtype=oneflow.float32)
+
+    """
+    return Max(dim, keepdim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/repeat.py b/python/oneflow/nn/modules/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b9876f2078149f777ae844a29423daf6fc0667c
--- /dev/null
+++ b/python/oneflow/nn/modules/repeat.py
@@ -0,0 +1,94 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Repeat(Module):
+    def __init__(self, sizes) -> None:
+        super().__init__()
+        self.sizes = sizes
+
+    def forward(self, input):
+        repeat = self.sizes
+        for repeat_v in repeat:
+            assert repeat_v > 0
+        input_shape = input.shape
+        assert len(repeat) >= len(input_shape)
+        in_reshape = []
+        out_reshape = []
+        expand_dim = []
+        diff = len(repeat) - len(input_shape)
+        for i in range(len(repeat) - 1, -1, -1):
+            if i >= diff:
+                if repeat[i] > 1:
+                    if input_shape[i - diff] > 1:
+                        in_reshape.insert(0, input_shape[i - diff])
+                        in_reshape.insert(0, 1)
+                        expand_dim.insert(0, input_shape[i - diff])
+                        expand_dim.insert(0, repeat[i])
+                        out_reshape.insert(0, input_shape[i - diff] * repeat[i])
+                    else:
+                        in_reshape.insert(0, input_shape[i - diff])
+                        expand_dim.insert(0, repeat[i])
+                        out_reshape.insert(0, repeat[i])
+                else:
+                    in_reshape.insert(0, input_shape[i - diff])
+                    expand_dim.insert(0, input_shape[i - diff])
+                    out_reshape.insert(0, input_shape[i - diff])
+            else:
+                expand_dim.insert(0, repeat[i])
+                out_reshape.insert(0, repeat[i])
+        new_tensor = flow.reshape(input, in_reshape)
+        tmp_tensor = new_tensor.expand(*expand_dim)
+        out = flow.reshape(tmp_tensor, out_reshape)
+        return out
+
+
+@register_tensor_op("repeat")
+def repeat_op(x, sizes):
+    """This operator repeat the input tensor to a larger size along the specified dimensions.
+
+    Args:
+        x (oneflow.Tensor): The input Tensor.
+        size (Sequence[int]): The number of times to repeat this tensor along each dimension
+
+    Returns:
+        oneflow.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = np.array([[[[0, 1]],
+        ...               [[2, 3]],
+        ...               [[4, 5]]]]).astype(np.int32)
+
+        >>> input = flow.Tensor(x)
+        >>> out = input.repeat(sizes=(1, 1, 2, 2))
+        >>> out.shape
+        flow.Size([1, 3, 2, 4])
+    """
+    return Repeat(sizes=sizes)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/reshape.py b/python/oneflow/nn/modules/reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7282a78cc9c2d12f4ea6d2ba55c24b2110513d6
--- /dev/null
+++ b/python/oneflow/nn/modules/reshape.py
@@ -0,0 +1,118 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Reshape(Module):
+    def __init__(self, shape: Sequence[int]) -> None:
+        super().__init__()
+        self.shape = shape
+
+    def forward(self, x):
+        return flow.F.reshape(x, shape=self.shape)
+
+
+@register_tensor_op("reshape")
+def reshape_op(x, shape: Sequence[int] = None):
+    """This operator reshapes a Tensor.
+
+    We can set one dimension in `shape` as `-1`, the operator will infer the complete shape.
+
+    Args:
+        x: A Tensor.
+        shape: Shape of the output tensor.
+    Returns:
+        A Tensor has the same type as `x`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> x = np.array(
+        ...    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+        ... ).astype(np.float32)
+        >>> input = flow.Tensor(x)
+
+        >>> y = flow.reshape(input, shape=[2, 2, 2, -1]).shape
+        >>> y
+        flow.Size([2, 2, 2, 2])
+
+    """
+    return Reshape(shape=shape)(x)
+
+
+@register_tensor_op("view")
+def view_op(x, shape: Sequence[int] = None):
+    """
+    The interface is consistent with PyTorch.
+    The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.Tensor.view.html
+
+    Returns a new tensor with the same data as the :attr:`self` tensor but of a
+    different :attr:`shape`.
+
+    The returned tensor shares the same data and must have the same number
+    of elements, but may have a different size. For a tensor to be viewed, the new
+    view size must be compatible with its original size and stride, i.e., each new
+    view dimension must either be a subspace of an original dimension, or only span
+    across original dimensions :math:`d, d+1, \\dots, d+k` that satisfy the following
+    contiguity-like condition that :math:`\\forall i = d, \\dots, d+k-1`,
+
+    .. math::
+
+      \\text{stride}[i] = \\text{stride}[i+1] \\times \\text{size}[i+1]
+
+    Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape`
+    without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a
+    :meth:`view` can be performed, it is advisable to use :meth:`reshape`, which
+    returns a view if the shapes are compatible, and copies (equivalent to calling
+    :meth:`contiguous`) otherwise.
+
+    Args:
+        x: A Tensor.
+        shape: Shape of the output tensor.
+    Returns:
+        A Tensor has the same type as `x`.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = np.array(
+        ...    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+        ... ).astype(np.float32)
+        >>> input = flow.Tensor(x)
+
+        >>> y = flow.view(input, shape=[2, 2, 2, -1]).numpy().shape
+        >>> y
+        (2, 2, 2, 2)
+
+    """
+    return Reshape(shape=shape)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/round.py b/python/oneflow/nn/modules/round.py
new file mode 100644
index 0000000000000000000000000000000000000000..2abffa27dc4f1a3dc4fed95ea8a59381c0559d9e
--- /dev/null
+++ b/python/oneflow/nn/modules/round.py
@@ -0,0 +1,68 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Round(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.round(x)
+
+
+def round_op(x):
+    """This operator rounds the value of Blob to the nearest integer.
+    Args:
+        x (oneflow.Tensor): A Tensor
+    Returns:
+        oneflow.Tensor: The result Tensor
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x1 = flow.Tensor(np.array([1.49999, 1.500001, 2.7]).astype(np.float32))
+        >>> out1 = flow.round(x1)
+        >>> out1.numpy()
+        array([1., 2., 3.], dtype=float32)
+        >>> x2 = flow.Tensor(np.array([2.499999, 7.5000001, 5.3, 6.8]).astype(np.float32))
+        >>> out2 = flow.round(x2)
+        >>> out2.numpy()
+        array([2., 8., 5., 7.], dtype=float32)
+
+    """
+    return Round()(x)
+
+
+@register_tensor_op("round")
+def round_op_tensor(x):
+    """
+    round() -> Tensor
+
+    See :func:`oneflow.round`
+
+    """
+    return Round()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/scatter_nd.py b/python/oneflow/nn/modules/scatter_nd.py
new file mode 100644
index 0000000000000000000000000000000000000000..c03bf73f2ac4c292b48652105f654c95b56b6d86
--- /dev/null
+++ b/python/oneflow/nn/modules/scatter_nd.py
@@ -0,0 +1,68 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.module import Module
+
+
+class ScatterNd(Module):
+    def __init__(self, shape: list):
+        super().__init__()
+        if not isinstance(shape, list):
+            raise ValueError("shape must be list!")
+        self.shape = shape
+
+    def forward(self, index, updates):
+        self._op = (
+            flow.builtin_op("scatter_nd")
+            .Input("indices")
+            .Input("updates")
+            .Output("out")
+            .Attr("shape", self.shape)
+            .Build()
+        )
+        res = self._op(index, updates)[0]
+        return res
+
+
+def _scatter_nd_op(index, update, shape):
+    """This operator inserts the elements in `updates` according to the `index` and create a new Tensor.
+
+    Args:
+        index: The indices of `updates`. Its type should be `flow.int`.
+        updates: The update Tensor.
+        shape (Sequence[int]): The constant tensor shape, the constant tensor elements are all zero.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> index = flow.Tensor(np.array([[1], [6], [4]]), dtype=flow.int)
+        >>> update = flow.Tensor(np.array([10.2,5.1,12.7]), dtype=flow.float)
+        >>> out = flow.scatter_nd(index,update, [8])
+        >>> out
+        tensor([ 0. , 10.2,  0. ,  0. , 12.7,  0. ,  5.1,  0. ], dtype=oneflow.float32)
+
+    """
+    return ScatterNd(shape)(index, update)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/sign.py b/python/oneflow/nn/modules/sign.py
new file mode 100644
index 0000000000000000000000000000000000000000..9befa45da1f2c88c6bf05a5c023d4e531104c730
--- /dev/null
+++ b/python/oneflow/nn/modules/sign.py
@@ -0,0 +1,73 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Sign(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.sign(x)
+
+
+def sign_op(x):
+    """Computes the sign of Tensor.
+
+    .. math::
+
+        \\text{out}_{i}  = \\text{sgn}(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x1 = flow.Tensor(np.array([-2, 0, 2]).astype(np.float32))
+        >>> out1 = flow.sign(x1)
+        >>> out1.numpy()
+        array([-1.,  0.,  1.], dtype=float32)
+        >>> x2 = flow.Tensor(np.array([-3.2, -4.5, 5.8]).astype(np.float32),device=flow.device('cuda'))
+        >>> out2 = flow.sign(x2)
+        >>> out2.numpy()
+        array([-1., -1.,  1.], dtype=float32)
+
+    """
+    return Sign()(x)
+
+
+@register_tensor_op("sign")
+def sign_op_tensor(x):
+    """
+
+    sign() -> Tensor
+
+    See :func:`oneflow.sign`
+
+    """
+    return Sign()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/sinh.py b/python/oneflow/nn/modules/sinh.py
new file mode 100644
index 0000000000000000000000000000000000000000..f01aee5bf87f821a113bd2ba766beb426efc9f51
--- /dev/null
+++ b/python/oneflow/nn/modules/sinh.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Sinh(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.sinh(x)
+
+
+def sinh_op(x):
+    """Returns a new tensor with the hyperbolic sine of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\sinh(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+
+        >>> x1 = flow.Tensor(np.array([1, 2, 3]))
+        >>> x2 = flow.Tensor(np.array([1.53123589,0.54242598,0.15117185]))
+        >>> x3 = flow.Tensor(np.array([1,0,-1]))
+
+        >>> flow.sinh(x1).numpy()
+        array([ 1.1752012,  3.6268604, 10.017875 ], dtype=float32)
+        >>> flow.sinh(x2).numpy()
+        array([2.20381  , 0.5694193, 0.1517483], dtype=float32)
+        >>> flow.sinh(x3).numpy()
+        array([ 1.1752012,  0.       , -1.1752012], dtype=float32)
+
+    """
+    return Sinh()(x)
+
+
+@register_tensor_op("sinh")
+def sinh_op_tensor(x):
+    """
+
+    sinh() -> Tensor
+
+    See :func:`oneflow.sinh`
+
+    """
+    return Sinh()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/slice.py b/python/oneflow/nn/modules/slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ce3bb2b2b07dc6f70766fa2d28f95ec3509faca
--- /dev/null
+++ b/python/oneflow/nn/modules/slice.py
@@ -0,0 +1,146 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence, Tuple
+
+import numpy as np
+
+import oneflow as flow
+from oneflow.nn.module import Module
+from oneflow.ops.array_ops import GetSliceAttrs, check_slice_tup_list
+
+
+class Slice(Module):
+    def __init__(
+        self, start: Tuple[int, ...], stop: Tuple[int, ...], step: Tuple[int, ...]
+    ) -> None:
+        super().__init__()
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def forward(self, x):
+        return flow.F.slice(x, start=self.start, stop=self.stop, step=self.step)
+
+
+def slice_op(x, slice_tup_list: Sequence[Tuple[int, int, int]]):
+    """Extracts a slice from a tensor.
+    The `slice_tup_list` assigns the slice indices in each dimension, the format is (start, stop, step).
+    The operator will slice the tensor according to the `slice_tup_list`.
+
+    Args:
+        x: A `Tensor`.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> input = flow.Tensor(np.random.randn(3, 6, 9).astype(np.float32))
+        >>> tup_list = [[None, None, None], [0, 5, 2], [0, 6, 3]]
+        >>> y = flow.slice(input, slice_tup_list=tup_list)
+        >>> y.shape
+        flow.Size([3, 3, 2])
+    """
+    (start, stop, step) = check_slice_tup_list(slice_tup_list, x.shape)
+    return Slice(start, stop, step)(x)
+
+
+class SliceUpdate(Module):
+    def __init__(
+        self, start: Tuple[int, ...], stop: Tuple[int, ...], step: Tuple[int, ...]
+    ) -> None:
+        super().__init__()
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def forward(self, x, update):
+        return flow.F.slice_update(
+            x, update, start=self.start, stop=self.stop, step=self.step
+        )
+
+
+def slice_update_op(x, update, slice_tup_list: Sequence[Tuple[int, int, int]]):
+    """Update a slice of tensor `x`. Like `x[start:stop:step] = update`. 
+
+    Args:
+        x: A `Tensor`, whose slice will be updated.
+        update: A `Tensor`, indicate the update content.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> input = flow.Tensor(np.array([1, 1, 1, 1, 1]).astype(np.float32))
+        >>> update = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        >>> y = flow.slice_update(input, update, slice_tup_list=[[1, 4, 1]])
+        >>> y.numpy()
+        array([1., 2., 3., 4., 1.], dtype=float32)
+    """
+    (start, stop, step) = GetSliceAttrs(slice_tup_list, x.shape)
+    return SliceUpdate(start, stop, step)(x, update)
+
+
+class LogicalSliceAssign(Module):
+    def __init__(
+        self, start: Tuple[int, ...], stop: Tuple[int, ...], step: Tuple[int, ...]
+    ) -> None:
+        super().__init__()
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def forward(self, x, update):
+        if update.dtype != x.dtype:
+            update = update.to(dtype=x.dtype)
+        return flow.F.logical_slice_assign(
+            x, update, start=self.start, stop=self.stop, step=self.step
+        )
+
+
+def logical_slice_assign_op(x, update, slice_tup_list: Sequence[Tuple[int, int, int]]):
+    """Update a slice of tensor `x`(in-place). Like `x[start:stop:step] = update`. 
+
+    Args:
+        x: A `Tensor`, whose slice will be updated.
+        update: A `Tensor`, indicate the update content.
+        slice_tup_list: A list of slice tuple, indicate each dimension slice (start, stop, step).
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input = flow.Tensor(np.array([1, 1, 1, 1, 1]).astype(np.float32))
+        >>> update = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        >>> y = flow.tmp.logical_slice_assign(input, update, slice_tup_list=[[1, 4, 1]])
+    """
+    "[summary]\n\n    Returns:\n        [type]: [description]\n    "
+    (start, stop, step) = GetSliceAttrs(slice_tup_list, x.shape)
+    return LogicalSliceAssign(start, stop, step)(x, update)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/softplus.py b/python/oneflow/nn/modules/softplus.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c4b8153e5f6f695db5a31ca657780081e80455f
--- /dev/null
+++ b/python/oneflow/nn/modules/softplus.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Softplus(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, x):
+        return flow.F.softplus(x)
+
+
+@register_tensor_op("softplus")
+def softplus_op(x):
+    """Applies the element-wise function:
+
+    .. math::
+        Softplus(x)= \\frac{1}{尾}*log(1+exp(尾鈭梮))
+
+    SoftPlus is a smooth approximation to the ReLU function and can be used to constrain the output of a machine to always be positive.
+
+    For numerical stability the implementation reverts to the linear function when :attr:`input X 尾 > threshold`.
+
+    Args:
+        beta:the value for the Softplus formulation.Default:1
+
+        threshold:values above this revert to a linear function.Default:20
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+
+        >>> x1 = flow.Tensor(np.array([1, 2, 3]))
+        >>> x2 = flow.Tensor(np.array([1.53123589,0.54242598,0.15117185]))
+        >>> x3 = flow.Tensor(np.array([1,0,-1]))
+
+        >>> flow.softplus(x1).numpy()
+        array([1.3132616, 2.126928 , 3.0485873], dtype=float32)
+        >>> flow.softplus(x2).numpy()
+        array([1.7270232, 1.0006962, 0.771587 ], dtype=float32)
+        >>> flow.softplus(x3).numpy()
+        array([1.3132616 , 0.6931472 , 0.31326166], dtype=float32)
+
+    """
+    return Softplus()(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/sort.py b/python/oneflow/nn/modules/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..915540bec203f7697c281c72abad9d66214a1bfa
--- /dev/null
+++ b/python/oneflow/nn/modules/sort.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+from oneflow.ops.transpose_util import (
+    get_inversed_perm,
+    get_perm_when_transpose_axis_to_last_dim,
+)
+
+
+class Sort(Module):
+    def __init__(self, dim: int = -1, descending: bool = False) -> None:
+        super().__init__()
+        self.dim = dim
+        direction = "DESCENDING" if descending else "ASCENDING"
+        self._argsort_op = (
+            flow.builtin_op("arg_sort")
+            .Input("in")
+            .Output("out")
+            .Attr("direction", direction)
+            .Build()
+        )
+
+    def forward(self, input):
+        num_dims = len(input.shape)
+        dim = self.dim if self.dim >= 0 else self.dim + num_dims
+        assert 0 <= dim < num_dims, "dim out of range"
+        if dim == num_dims - 1:
+            indices = self._argsort_op(input)[0]
+            return (flow.gather(input, indices, dim), indices)
+        else:
+            perm = get_perm_when_transpose_axis_to_last_dim(num_dims, dim)
+            x = flow.F.transpose(input, perm=perm)
+            indices = self._argsort_op(x)[0]
+            indices = flow.F.transpose(indices, perm=get_inversed_perm(perm))
+            return (flow.gather(input, indices, dim), indices)
+
+
+@register_tensor_op("sort")
+def sort_op(input, dim: int = -1, descending: bool = False):
+    """Sorts the elements of the input tensor along a given dimension in ascending order by value.
+
+    Args:
+        input (oneflow.Tensor): The input Tensor.
+        dim (int, optional): dimension to be sorted. Defaults to the last dim (-1).
+        descending (bool, optional): controls the sorting order (ascending or descending).
+
+    Returns:
+        Tuple(oneflow.Tensor, oneflow.Tensor(dtype=int32)): A tuple of (values, indices), where
+        where the values are the sorted values and the indices are the indices of the elements
+        in the original input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = np.array([[1, 3, 8, 7, 2], [1, 9, 4, 3, 2]], dtype=np.float32)
+        >>> input = flow.Tensor(x)
+        >>> (values, indices) = flow.sort(input)
+        >>> values
+        tensor([[1., 2., 3., 7., 8.],
+                [1., 2., 3., 4., 9.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[0, 4, 1, 3, 2],
+                [0, 4, 3, 2, 1]], dtype=oneflow.int32)
+        >>> (values, indices) = flow.sort(input, descending=True)
+        >>> values
+        tensor([[8., 7., 3., 2., 1.],
+                [9., 4., 3., 2., 1.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[2, 3, 1, 4, 0],
+                [1, 2, 3, 4, 0]], dtype=oneflow.int32)
+        >>> (values, indices) = flow.sort(input, dim=0)
+        >>> values
+        tensor([[1., 3., 4., 3., 2.],
+                [1., 9., 8., 7., 2.]], dtype=oneflow.float32)
+        >>> indices
+        tensor([[0, 0, 1, 1, 0],
+                [1, 1, 0, 0, 1]], dtype=oneflow.int32)
+ 
+    """
+    return Sort(dim=dim, descending=descending)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/sparse.py b/python/oneflow/nn/modules/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..122cd44068bb32cb8f404f1e4b484d738870aacc
--- /dev/null
+++ b/python/oneflow/nn/modules/sparse.py
@@ -0,0 +1,109 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import List, Optional, Tuple
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.module import Module
+
+
+class Embedding(Module):
+    """A simple lookup table that stores embeddings of a fixed dictionary and size.
+
+    This module is often used to store word embeddings and retrieve them using indices.
+    The input to the module is a list of indices, and the output is the corresponding
+    word embeddings.
+
+    Args:
+        num_embeddings (int): size of the dictionary of embeddings
+        embedding_dim (int): the size of each embedding vector
+        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
+                                    therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
+                                    i.e. it remains as a fixed "pad". For a newly constructed Embedding,
+                                    the embedding vector at :attr:`padding_idx` will default to all zeros,
+                                    but can be updated to another value to be used as the padding vector.
+    
+    For example:
+
+    .. code-block:: python
+        
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> indices = flow.Tensor([[1, 2, 4, 5], [4, 3, 2, 9]], dtype=flow.int)
+        >>> m = flow.nn.Embedding(10, 3)
+        >>> y = m(indices)
+
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: Optional[int] = None,
+        max_norm: Optional[float] = None,
+        norm_type: Optional[float] = None,
+        scale_grad_by_freq: bool = False,
+        sparse: bool = False,
+        _weight: Optional[Tensor] = None,
+    ):
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert (
+                    padding_idx < self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+            elif padding_idx < 0:
+                assert (
+                    padding_idx >= -self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+                padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        assert max_norm is None, "Not support max_norm yet!"
+        assert norm_type is None, "Not support norm_type yet!"
+        assert scale_grad_by_freq is False, "Not support scale_grad_by_freq=True yet!"
+        assert sparse is False, "Not support sparse=True yet!"
+        if _weight is None:
+            self.weight = flow.nn.Parameter(Tensor(num_embeddings, embedding_dim))
+            self.reset_parameters()
+        else:
+            assert list(_weight.shape) == [
+                num_embeddings,
+                embedding_dim,
+            ], "Shape of weight does not match num_embeddings and embedding_dim"
+            self.weight = flow.nn.Parameter(_weight)
+        self.sparse = sparse
+
+    def reset_parameters(self) -> None:
+        flow.nn.init.normal_(self.weight)
+        self._fill_padding_idx_with_zero()
+
+    def _fill_padding_idx_with_zero(self) -> None:
+        if self.padding_idx is not None:
+            with flow.no_grad():
+                self.weight[self.padding_idx].fill_(0)
+
+    def forward(self, indices):
+        res = flow.F.gather(self.weight, indices, axis=0)
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/squeeze.py b/python/oneflow/nn/modules/squeeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..28f04510679264a92546b767dbd26e134078c569
--- /dev/null
+++ b/python/oneflow/nn/modules/squeeze.py
@@ -0,0 +1,72 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+import oneflow.framework.id_util as id_util
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Squeeze(Module):
+    def __init__(self, dim: Optional[Sequence[int]] = None) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        if self.dim is None:
+            return x
+        return flow.F.squeeze(x, dim=self.dim)
+
+
+@register_tensor_op("squeeze")
+def squeeze_op(input, dim: Optional[Sequence[int]] = None):
+    """This operator removes the specified dimention which size is 1 of the input Tensor.
+    If the `dim` is not specified, this operator will remove all the dimention which size is 1 of the input Tensor.
+
+    The amount of element in return value is the same as Tensor `input`.
+
+    Args:
+        input (oneflow.Tensor): The input Tensor.
+        dim (Optional[Sequence[int]]): The dim. Defaults to None.
+
+    Returns:
+        Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> input = flow.Tensor(np.array([[[[1, 1, 1]]]]).astype(np.int32))
+        >>> out = flow.squeeze(input, dim=[1, 2]).shape
+        >>> out
+        flow.Size([1, 3])
+
+    """
+    if isinstance(dim, int):
+        dim = [dim]
+    elif dim is None:
+        dim = range(input.ndim)
+    dim = list(filter(lambda i: input.size(i) == 1, dim))
+    return Squeeze(dim=dim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/stack.py b/python/oneflow/nn/modules/stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a6ea9e95fddbcf4da278c0189a7e8648cfce54
--- /dev/null
+++ b/python/oneflow/nn/modules/stack.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import List, Tuple
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor, register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Stack(Module):
+    def __init__(self, dim: int = 0) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, inputs):
+        assert isinstance(inputs, (List, Tuple))
+        input_shape = inputs[0].shape
+        max_dim = len(input_shape)
+        if self.dim < 0:
+            self.dim = self.dim + max_dim + 1
+        assert self.dim >= 0 and self.dim <= max_dim
+        input_list_length = len(inputs)
+        unsqueezed = list()
+        for i in range(input_list_length):
+            current_shape = inputs[i].shape
+            assert (
+                input_shape == current_shape
+            ), "Each tensor should have the same shape ! Found a tensor instance shape is: {}".format(
+                current_shape
+            )
+            unsqueezed.append(inputs[i].unsqueeze(dim=self.dim))
+        return flow.cat(unsqueezed, dim=self.dim)
+
+
+@register_tensor_op("stack")
+def stack(inputs: Tensor, dim: int = 0) -> None:
+    """Concatenates a sequence of tensors along a new dimension.
+    The returned tensor shares the same underlying data with input tensors.
+
+    A :attr:`dim` value within the range `[-input.ndimension() - 1, input.ndimension() + 1]`
+    can be used. Negative :attr:`dim` will correspond to :meth:`stack`
+    applied at :attr:`dim` = ``dim + input.ndimension() + 1``.
+
+    Args:
+        inputs (List[oneflow.Tensor]): the list of input tensors. Each tensor should have the same shape.
+        dim (int): the index at which to insert the concatenated dimension.
+
+    Returns:
+        A `Tensor`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> x = flow.Tensor(np.random.rand(1, 3, 5))
+        >>> y = flow.Tensor(np.random.rand(1, 3, 5))
+        >>> out = flow.stack([x, y], dim = -1)
+        >>> out.shape
+        flow.Size([1, 3, 5, 2])
+    """
+    return Stack(dim)(inputs)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/tan.py b/python/oneflow/nn/modules/tan.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f1cf0bd05bf0f9a6f888258fa97cd6774a45503
--- /dev/null
+++ b/python/oneflow/nn/modules/tan.py
@@ -0,0 +1,68 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Tan(Module):
+    def __init__(self):
+        super().__init__()
+        self._op = flow.builtin_op("tan").Input("x").Output("y").Build()
+
+    def forward(self, x):
+        return self._op(x)[0]
+
+
+def tan_op(input):
+    """Returns  the tan value of the elements of :attr:`input`.
+
+    .. math::
+        \\text{out}_{i} = \\tan(\\text{input}_{i})
+
+    Args:
+        input (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> np_arr = np.array([-1/4*np.pi, 0, 1/4*np.pi]).astype(np.float32)
+        >>> input = flow.Tensor(np_arr)
+        >>> output = flow.tan(input)
+        >>> output
+        tensor([-1.,  0.,  1.], dtype=oneflow.float32)
+
+    """
+    return Tan()(input)
+
+
+@register_tensor_op("tan")
+def tan_op_tensor(input):
+    """
+    tan() -> Tensor
+    See :func:`oneflow.tan`
+
+    """
+    return Tan()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/tensor_buffer.py b/python/oneflow/nn/modules/tensor_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1d3e30703b418d7df5957e63b190e846f12c5e
--- /dev/null
+++ b/python/oneflow/nn/modules/tensor_buffer.py
@@ -0,0 +1,147 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+class TensorBufferToTensor(Module):
+    def __init__(self, dtype, instance_shape):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("tensor_buffer_to_tensor")
+            .Input("in")
+            .Output("out")
+            .Attr("dtype", dtype)
+            .Attr("instance_shape", instance_shape)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+def tensor_buffer_to_tensor_op(x, dtype: flow.dtype, instance_shape: Sequence[int]):
+    """This operator converts the Tensor's type from TensorBuffer to original type.
+    Some operator's output data type is `TensorBuffer`, you can use this operator to convert back
+    to `Tensor`.
+
+    Refer to `Concept Explanation <https://docs.oneflow.org/basics_topics/concept_explanation.html#3tensorbuffer-tensorlist>`_
+    for more about TensorBuffer.
+
+    Args:
+        x (oneflow.Tensor): The input Tensor.
+        dtype (flow.dtype): The data dtype.
+        instance_shape (Sequence[int]): The shape of each TensorBuffer instance.
+
+    Returns:
+        oneflow.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> x = np.random.randn(4, 16, 64, 64).astype(np.float32)
+        >>> x = flow.Tensor(x)
+        >>> x = flow.tensor_to_tensor_buffer(x, instance_dims=2)
+        >>> output = flow.tensor_buffer_to_tensor(x, instance_shape=(64, 64), dtype=flow.float)
+        >>> output.shape
+        flow.Size([4, 16, 64, 64])
+
+    """
+    return TensorBufferToTensor(dtype=dtype, instance_shape=instance_shape)(x)
+
+
+class TensorToTensorBuffer(Module):
+    def __init__(self, instance_dims):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("tensor_to_tensor_buffer")
+            .Input("in")
+            .Output("out")
+            .Attr("instance_dims", instance_dims)
+            .Build()
+        )
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+def tensor_to_tensor_buffer(x, instance_dims: int):
+    """This operator converts the Tensor's type to TensorBuffer.
+
+    Refer to `Concept Explanation <https://docs.oneflow.org/basics_topics/concept_explanation.html#3tensorbuffer-tensorlist>`_
+    for more about TensorBuffer.
+
+    Args:
+        x (oneflow.Tensor): The input Tensor.
+        instance_dims (int): The dimensions of dynamic tensor instance.
+
+    Returns:
+        oneflow.Tensor: The result Tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> x = np.random.randn(4, 16, 64, 64).astype(np.float32)
+        >>> x = flow.Tensor(x)
+        >>> x = flow.tensor_to_tensor_buffer(x, instance_dims=2)
+        >>> output = flow.tensor_buffer_to_tensor(x, instance_shape=(64, 64), dtype=flow.float)
+        >>> output.shape
+        flow.Size([4, 16, 64, 64])
+    
+    """
+    return TensorToTensorBuffer(instance_dims=instance_dims)(x)
+
+
+class GenTensorBuffer(Module):
+    def __init__(self, shape, shape_list, value_list, data_type, dynamic_out):
+        super().__init__()
+        self._op = (
+            flow.builtin_op("gen_tensor_buffer")
+            .Output("out")
+            .Attr("shape", shape)
+            .Attr("shape_list", shape_list)
+            .Attr("value_list", value_list)
+            .Attr("data_type", data_type)
+            .Attr("dynamic_out", dynamic_out)
+            .Build()
+        )
+
+    def forward(self):
+        return self._op()[0]
+
+
+def gen_tensor_buffer(
+    shape: Sequence[int],
+    shape_list: Sequence[Sequence[int]],
+    value_list: Sequence[float],
+    data_type: Optional[flow.dtype] = flow.float32,
+    dynamic_out: Optional[bool] = False,
+):
+    return GenTensorBuffer(shape, shape_list, value_list, data_type, dynamic_out)()
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/tensor_ops.py b/python/oneflow/nn/modules/tensor_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c3a18ef2333b40b48e89a1ee3de3455403a4517
--- /dev/null
+++ b/python/oneflow/nn/modules/tensor_ops.py
@@ -0,0 +1,89 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class TypeAs(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input, target):
+        return input.to(dtype=target.dtype)
+
+
+@register_tensor_op("type_as")
+def type_as_op(input, target):
+    """Returns this tensor cast to the type of the given tensor.
+        This is a no-op if the tensor is already of the correct type.
+
+    Args:
+        input  (Tensor): the input tensor.
+        target (Tensor): the tensor which has the desired type.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> input = flow.Tensor(np.random.randn(1, 2, 3), dtype=flow.float32)
+        >>> target = flow.Tensor(np.random.randn(4, 5, 6), dtype = flow.int32)
+        >>> input = input.type_as(target)
+        >>> input.dtype
+        oneflow.int32
+
+    """
+    return TypeAs()(input, target)
+
+
+class Long(Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input):
+        return input.to(dtype=flow.int64)
+
+
+@register_tensor_op("long")
+def long_op(input):
+    """`Tensor.long()` is equivalent to `Tensor.to(flow.int64)`. See to().
+
+    Args:
+        input  (Tensor): the input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> input = flow.Tensor(np.random.randn(1, 2, 3), dtype=flow.float32)
+        >>> input = input.long()
+        >>> input.dtype
+        oneflow.int64
+
+    """
+    return Long()(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/tile.py b/python/oneflow/nn/modules/tile.py
new file mode 100644
index 0000000000000000000000000000000000000000..2441fb43e62934ea6bff11f50affea737f43a93c
--- /dev/null
+++ b/python/oneflow/nn/modules/tile.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor, register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Tile(Module):
+    def __init__(self, reps: tuple) -> None:
+        super().__init__()
+        self.reps = reps
+
+    def forward(self, input: Tensor) -> Tensor:
+        reps = self.reps
+        for s in self.reps:
+            assert s > 0
+        input_shape = input.shape
+        diff = len(input_shape) - len(reps)
+        if diff > 0:
+            shape = [1 for _ in range(diff)]
+            shape.extend([i for i in reps])
+            reps = tuple(shape)
+        return input.repeat(reps)
+
+
+@register_tensor_op("tile")
+def tile_op(x, reps):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.tile.html
+
+    Constructs a tensor by repeating the elements of ``input``.  The ``reps`` argument specifies the number
+    of repetitions in each dimension.
+
+    If ``reps`` specifies fewer dimensions than ``input`` has, then ones are prepended to ``reps`` until
+    all dimensions are specified.  For example, if ``input`` has shape (8, 6, 4, 2) and ``reps`` is (2, 2),
+    then ``reps`` is treated as (1, 1, 2, 2).
+
+    Analogously, if ``input`` has fewer dimensions than ``reps`` specifies, then ``input`` is treated as
+    if it were unsqueezed at dimension zero until it has as many dimensions as ``reps`` specifies.
+    For example, if ``input`` has shape (4, 2) and ``reps`` is (3, 3, 2, 2), then ``input`` is treated as
+    if it had the shape (1, 1, 4, 2).
+
+    .. note::
+        This function is similar to NumPy鈥檚 tile function.
+
+    Args:
+        input (oneflow.Tensor): the tensor whose elements to repeat.
+        reps (tuple): the number of repetitions per dimension.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = np.array([1, 2]).astype(np.int32)
+        >>> input = flow.Tensor(x, dtype=flow.int32)
+        >>> out = input.tile(reps=(2,))
+        >>> out
+        tensor([1, 2, 1, 2], dtype=oneflow.int32)
+
+        >>> x = np.random.randn(5, 2, 1)
+        >>> input = flow.Tensor(x)
+        >>> out = input.tile(reps=(3, 4))
+        >>> out.size()
+        flow.Size([5, 6, 4])
+
+    """
+    return Tile(reps=reps)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/to.py b/python/oneflow/nn/modules/to.py
new file mode 100644
index 0000000000000000000000000000000000000000..993e9868a569819b9e899bc011cbaf6578680dc6
--- /dev/null
+++ b/python/oneflow/nn/modules/to.py
@@ -0,0 +1,100 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class To(Module):
+    def __init__(self, copy):
+        super().__init__()
+        self.copy = copy
+
+    def forward(self, x, device, dtype):
+        result = x
+        if device is not None:
+            if x.device != device or self.copy:
+                result = flow.F.copy(x, device_type=device.type, device_id=device.index)
+        if dtype is not None:
+            if x.dtype != dtype or self.copy:
+                result = flow.F.cast(result, dtype=dtype)
+        return result
+
+
+@register_tensor_op("to")
+def to_op(input, *args, **kwargs):
+    """Performs Tensor dtype and/or device conversion. 
+        A flow.dtype and flow.device are inferred from the arguments of `input.to(*args, **kwargs)`.
+    
+    .. note::
+        If the ``input`` Tensor already
+        has the correct :class:`flow.dtype` and :class:`flow.device`, then ``input`` is returned.
+        Otherwise, the returned tensor is a copy of ``input`` with the desired.
+
+    Args:
+        input (oneflow.Tensor): An input tensor.
+        *args (oneflow.Tensor or oneflow.device or oneflow.dtype): Positional arguments
+        **kwargs (oneflow.device or oneflow.dtype) : Key-value arguments
+
+    Returns:
+        oneflow.Tensor: A Tensor.
+    
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> arr = np.random.randint(1, 9, size=(1, 2, 3, 4))
+        >>> input = flow.Tensor(arr)
+        >>> output = input.to(dtype=flow.float32)
+        >>> np.array_equal(arr.astype(np.float32), output.numpy())
+        True
+
+    """
+    copy = kwargs.get("copy", False)
+    device = kwargs.get("device", None)
+    dtype = kwargs.get("dtype", None)
+    if len(args) > 0:
+        if isinstance(args[0], flow.Tensor):
+            if len(args) == 2:
+                copy = args[1]
+            return To(copy)(input, args[0].device, args[0].dtype)
+        elif isinstance(args[0], flow.dtype):
+            if len(args) == 2:
+                copy = args[1]
+            return To(copy)(input, None, args[0])
+        else:
+            device = flow.device(args[0]) if isinstance(args[0], str) else args[0]
+            if len(args) > 1:
+                dtype = args[1]
+                assert isinstance(dtype, flow.dtype)
+            if len(args) > 2:
+                copy = args[2]
+            assert isinstance(device, flow.device)
+            return To(copy)(input, device, dtype)
+    if isinstance(device, flow.device) or isinstance(dtype, flow.dtype):
+        return To(copy)(input, device, dtype)
+    raise TypeError("to() received an invalid combination of arguments")
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/transpose.py b/python/oneflow/nn/modules/transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..a503074c743fcede7b47b21fbc29ff4ccfd7d31d
--- /dev/null
+++ b/python/oneflow/nn/modules/transpose.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Transpose(Module):
+    def __init__(
+        self,
+        dim0: int,
+        dim1: int,
+        conjugate: bool = False,
+        batch_axis_non_change: bool = False,
+    ) -> None:
+        super().__init__()
+        if conjugate:
+            raise NotImplementedError
+        if batch_axis_non_change:
+            raise NotImplementedError
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x):
+        x_shape = x.shape
+        dim0 = self.dim0
+        dim1 = self.dim1
+        if dim0 < 0:
+            dim0 += len(x_shape)
+        if dim1 < 0:
+            dim1 += len(x_shape)
+        assert dim0 >= 0 and dim0 < len(
+            x_shape
+        ), "Invalid dim0 {}, len(shape): {}".format(dim0, len(x_shape))
+        assert dim1 >= 0 and dim1 < len(
+            x_shape
+        ), "Invalid dim1 {}, len(shape): {}".format(dim1, len(x_shape))
+        perm = []
+        for i in range(len(x_shape)):
+            perm.append(i)
+        (perm[dim0], perm[dim1]) = (perm[dim1], perm[dim0])
+        return flow.F.transpose(x, perm=perm)
+
+
+@register_tensor_op("transpose")
+def transpose_op(tensor, dim0, dim1):
+    """Returns a tensor that is a transposed version of input. The given dimensions dim0 and dim1 are swapped.
+
+    The resulting out tensor shares its underlying storage with the input tensor, so changing the content of one would change the content of the other.
+
+    Args:
+        tensor (oneflow.Tensor): The input tensor.
+        dim0 (int): the first dimension to be transposed.
+        dim1 (int): the second dimension to be transposed.
+    Returns:
+        Tensor: A transposed tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> input = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        >>> out = flow.transpose(input, 0, 1).shape
+        >>> out
+        flow.Size([6, 2, 5, 3])
+
+    """
+    return Transpose(dim0=dim0, dim1=dim1)(tensor)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/triu.py b/python/oneflow/nn/modules/triu.py
new file mode 100644
index 0000000000000000000000000000000000000000..532b2684c2ae4f7319899d5351633d9496d5ff8a
--- /dev/null
+++ b/python/oneflow/nn/modules/triu.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Triu(Module):
+    def __init__(self, diagonal=0):
+        super().__init__()
+        self.diagonal = diagonal
+
+    def forward(self, x):
+        return flow.F.triu(x, self.diagonal)
+
+
+@register_tensor_op("triu")
+def triu_op(x, diagonal=0):
+    """Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input, 
+    the other elements of the result tensor out are set to 0.
+    
+    Args:
+        input (Tensor): the input tensor. 
+        diagonal (int, optional): the diagonal to consider
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        
+        >>> x = flow.Tensor(np.ones(shape=(3, 3)).astype(np.float32))
+        >>> flow.triu(x)
+        tensor([[1., 1., 1.],
+                [0., 1., 1.],
+                [0., 0., 1.]], dtype=oneflow.float32)
+
+    """
+    return Triu(diagonal)(x)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/unsqueeze.py b/python/oneflow/nn/modules/unsqueeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..34fea85c01addad901936661c7829debbb97bbf0
--- /dev/null
+++ b/python/oneflow/nn/modules/unsqueeze.py
@@ -0,0 +1,68 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Unsqueeze(Module):
+    def __init__(self, dim: int = 0) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, input):
+        assert (
+            -(1 + input.ndimension()) <= self.dim <= input.ndimension()
+        ), "dim should within the range [-input.ndimension() - 1, input.ndimension() + 1)"
+        if self.dim < 0:
+            self.dim = 1 + input.ndimension() + self.dim
+        return flow.F.expand_dims(input, axis=self.dim)
+
+
+@register_tensor_op("unsqueeze")
+def unsqueeze_op(input, dim):
+    """Returns a new tensor with a dimension of size one inserted at the
+    specified position.
+
+    The returned tensor shares the same underlying data with this tensor.
+
+    A :attr:`dim` value within the range `[-input.ndimension() - 1, input.ndimension() + 1)`
+    can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze`
+    applied at :attr:`dim` = ``dim + input.ndimension() + 1``.
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the index at which to insert the singleton dimension
+
+    For example: 
+
+    .. code-block:: python 
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> x = flow.Tensor(np.random.rand(2, 3, 4))
+        >>> y = x.unsqueeze(2)
+        >>> y.shape
+        flow.Size([2, 3, 1, 4])
+    """
+    return Unsqueeze(dim)(input)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/upsampling.py b/python/oneflow/nn/modules/upsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..34e6ba64a966ba1734aa7228de710a82ab013718
--- /dev/null
+++ b/python/oneflow/nn/modules/upsampling.py
@@ -0,0 +1,243 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple, Union
+
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Upsample(Module):
+    """The interface is consistent with PyTorch.    
+    
+    The documentation is referenced from: https://pytorch.org/docs/1.9.0/_modules/torch/nn/modules/upsampling.html#Upsample
+    
+    Upsamples a given multi-channel 1D (temporal), 2D (spatial) or 3D (volumetric) data.
+
+    The input data is assumed to be of the form
+    `minibatch x channels x [optional depth] x [optional height] x width`.
+    Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor.
+
+    The algorithms available for upsampling are nearest neighbor and linear,
+    bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor,
+    respectively.
+
+    One can either give a :attr:`scale_factor` or the target output :attr:`size` to
+    calculate the output size. (You cannot give both, as it is ambiguous)
+
+    Args:
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional):
+            output spatial sizes
+        scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional):
+            multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
+            ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
+            Default: ``'nearest'``
+        align_corners (bool, optional): if ``True``, the corner pixels of the input
+            and output tensors are aligned, and thus preserving the values at
+            those pixels. This only has effect when :attr:`mode` is
+            ``'linear'``, ``'bilinear'``, or ``'trilinear'``. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, W_{in})`, :math:`(N, C, H_{in}, W_{in})` or :math:`(N, C, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C, W_{out})`, :math:`(N, C, H_{out}, W_{out})`
+          or :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
+
+    .. math::
+        D_{out} = \\left\\lfloor D_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+        H_{out} = \\left\\lfloor H_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+        W_{out} = \\left\\lfloor W_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. warning::
+        With ``align_corners = True``, the linearly interpolating modes
+        (`linear`, `bilinear`, `bicubic`, and `trilinear`) don't proportionally
+        align the output and input pixels, and thus the output values can depend
+        on the input size. This was the default behavior for these modes up to
+        version 0.3.1. Since then, the default behavior is
+        ``align_corners = False``. See below for concrete examples on how this
+        affects the outputs.
+
+    .. note::
+        If you want downsampling/general resizing, you should use :func:`~nn.functional.interpolate`.
+
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+
+        >>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 2, 2)), dtype=flow.float32)
+        >>> input = input.to("cuda")
+        >>> m = flow.nn.Upsample(scale_factor=2.0, mode="nearest")
+        >>> output = m(input)
+        >>> output #doctest: +ELLIPSIS
+        tensor([[[[1., 1., 2., 2.],
+                  ...
+                  [3., 3., 4., 4.]]]], device='cuda:0', dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        size: Optional[Union[int, Tuple[int, ...]]] = None,
+        scale_factor: Optional[Union[float, Tuple[float, ...]]] = None,
+        mode: str = "nearest",
+        align_corners: Optional[bool] = None,
+    ):
+        super().__init__()
+        self.size = size
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        return flow.nn.functional.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners,
+        )
+
+    def extra_repr(self) -> str:
+        if self.scale_factor is not None:
+            info = "scale_factor=" + str(self.scale_factor)
+        else:
+            info = "size=" + str(self.size)
+        info += ", mode=" + self.mode
+        return info
+
+
+class UpsamplingNearest2d(Upsample):
+    """Applies a 2D nearest neighbor upsampling to an input signal composed of several input
+    channels.
+
+    To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor`
+    as it's constructor argument.
+
+    When :attr:`size` is given, it is the output size of the image `(h, w)`.
+
+    Args:
+        size (int or Tuple[int, int], optional): output spatial sizes
+        scale_factor (float or Tuple[float, float], optional): multiplier for
+            spatial size.
+
+    .. warning::
+        This class is deprecated in favor of :func:`~nn.functional.interpolate`.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+    .. math::
+          H_{out} = \\left\\lfloor H_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+          W_{out} = \\left\\lfloor W_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 2, 2)), dtype=flow.float32)
+        >>> input = input.to("cuda")
+        >>> m = flow.nn.UpsamplingNearest2d(scale_factor=2.0)
+        >>> output = m(input)
+        >>> output #doctest: +ELLIPSIS
+        tensor([[[[1., 1., 2., 2.],
+                  ...
+                  [3., 3., 4., 4.]]]], device='cuda:0', dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        size: Optional[Tuple[int, int]] = None,
+        scale_factor: Optional[Tuple[float, float]] = None,
+    ) -> None:
+        super(UpsamplingNearest2d, self).__init__(size, scale_factor, mode="nearest")
+
+
+class UpsamplingBilinear2d(Upsample):
+    """Applies a 2D bilinear upsampling to an input signal composed of several input
+    channels.
+
+    To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor`
+    as it's constructor argument.
+
+    When :attr:`size` is given, it is the output size of the image `(h, w)`.
+
+    Args:
+        size (int or Tuple[int, int], optional): output spatial sizes
+        scale_factor (float or Tuple[float, float], optional): multiplier for
+            spatial size.
+
+    .. warning::
+        This class is deprecated in favor of :func:`~nn.functional.interpolate`. It is
+        equivalent to ``nn.functional.interpolate(..., mode='bilinear', align_corners=True)``.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+    .. math::
+        H_{out} = \\left\\lfloor H_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    .. math::
+        W_{out} = \\left\\lfloor W_{in} \\times \\text{scale_factor} \\right\\rfloor
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        
+        >>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 2, 2)), dtype=flow.float32)
+        >>> input = input.to("cuda")
+        >>> m = flow.nn.UpsamplingBilinear2d(scale_factor=2.0)
+        >>> output = m(input)
+        >>> output #doctest: +ELLIPSIS
+        tensor([[[[1.    , 1.3333, 1.6667, 2.    ],
+                  ...
+                  [3.    , 3.3333, 3.6667, 4.    ]]]], device='cuda:0',
+               dtype=oneflow.float32)
+
+    """
+
+    def __init__(
+        self,
+        size: Optional[Tuple[int, int]] = None,
+        scale_factor: Optional[Tuple[float, float]] = None,
+    ) -> None:
+        super(UpsamplingBilinear2d, self).__init__(
+            size, scale_factor, mode="bilinear", align_corners=True
+        )
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/utils.py b/python/oneflow/nn/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..becd72a6cc131a83d63d83bade63ecbed5310cdf
--- /dev/null
+++ b/python/oneflow/nn/modules/utils.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections.abc as container_abcs
+from itertools import repeat
+from typing import List
+
+
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, container_abcs.Iterable):
+            return tuple(x)
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+def _getint():
+    def parse(x):
+        if isinstance(x, container_abcs.Iterable):
+            return int(x[0])
+        return int(x)
+
+    return parse
+
+
+_getint = _getint()
+_single = _ntuple(1)
+_pair = _ntuple(2)
+_triple = _ntuple(3)
+_quadruple = _ntuple(4)
+
+
+def _reverse_repeat_tuple(t, n):
+    """Reverse the order of `t` and repeat each element for `n` times.
+    This can be used to translate padding arg used by Conv and Pooling modules
+    to the ones used by `F.pad`.
+    """
+    return tuple((x for x in reversed(t) for _ in range(n)))
+
+
+def _list_with_default(out_size, defaults):
+    if isinstance(out_size, int):
+        return out_size
+    if len(defaults) <= len(out_size):
+        raise ValueError(
+            "Input dimension should be at least {}".format(len(out_size) + 1)
+        )
+    return [
+        v if v is not None else d
+        for (v, d) in zip(out_size, defaults[-len(out_size) :])
+    ]
+
+
+def _check_axis(axis, shape):
+    ndim = len(shape)
+    if axis is None:
+        axis = list(range(len(shape)))
+    if isinstance(axis, int):
+        axis = [axis]
+    assert isinstance(axis, (list, tuple)), "Invalid axis {}".format(axis)
+    axis = list(axis)
+    for i in range(len(axis)):
+        assert (
+            -ndim <= axis[i] <= ndim - 1
+        ), "Dimension out of range (expected to be in range of [{}, {}], but got {})".format(
+            -ndim, ndim - 1, axis[i]
+        )
+        if axis[i] < 0:
+            axis[i] = axis[i] + ndim
+    return axis
+
+
+def _check_inplace_valid(x):
+    if x.is_leaf and x.requires_grad:
+        raise RuntimeError(
+            "a leaf Tensor that requires grad is being used in an in-place operation"
+        )
diff --git a/python/oneflow/nn/modules/where.py b/python/oneflow/nn/modules/where.py
new file mode 100644
index 0000000000000000000000000000000000000000..560495a02856789ee85575b463360be86b1aae95
--- /dev/null
+++ b/python/oneflow/nn/modules/where.py
@@ -0,0 +1,130 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import register_tensor_op
+from oneflow.nn.module import Module
+
+
+class Where(Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, condition, x, y):
+        assert condition.dtype == flow.int32 or condition.dtype == flow.int8
+        if isinstance(x, int) or isinstance(x, float):
+            x = flow.Tensor(
+                [float(x)],
+                dtype=flow.float32,
+                device=flow.device(condition.device.type),
+            )
+        if isinstance(y, int) or isinstance(y, float):
+            y = flow.Tensor(
+                [float(y)],
+                dtype=flow.float32,
+                device=flow.device(condition.device.type),
+            )
+        assert (
+            condition.device.type == x.device.type
+            and condition.device.type == y.device.type
+        )
+        assert len(condition.shape) == len(x.shape) and len(condition.shape) == len(
+            y.shape
+        ), f"The dim of where module's inputs can not match, please check!"
+        broadcast_cond = condition
+        broadcast_x = x
+        broadcast_y = y
+        broadcast_like_shape = []
+        broadcast_condition_axes = []
+        broadcast_x_axes = []
+        broadcast_y_axes = []
+        for i in range(len(x.shape)):
+            max_dim = max(x.shape[i], max(y.shape[i], condition.shape[i]))
+            broadcast_like_shape.append(max_dim)
+            if max_dim != condition.shape[i]:
+                broadcast_condition_axes.append(i)
+            if max_dim != x.shape[i]:
+                broadcast_x_axes.append(i)
+            if max_dim != y.shape[i]:
+                broadcast_y_axes.append(i)
+        broadcast_like_tensor = flow.zeros(
+            tuple(broadcast_like_shape), dtype=flow.float32
+        )
+        broadcast_like_tensor = broadcast_like_tensor.to(x.device.type)
+        broadcast_like_tensor.requires_grad = x.requires_grad or y.requires_grad
+        if len(broadcast_condition_axes) != 0:
+            condition = flow.cast(condition, flow.float32)
+            broadcast_cond = flow.broadcast_like(
+                condition, broadcast_like_tensor, tuple(broadcast_condition_axes)
+            )
+            broadcast_cond = flow.cast(broadcast_cond, flow.int32)
+        if len(broadcast_x_axes) != 0:
+            broadcast_x = flow.broadcast_like(
+                x, broadcast_like_tensor, broadcast_axes=tuple(broadcast_x_axes)
+            )
+        if len(broadcast_y_axes) != 0:
+            broadcast_y = flow.broadcast_like(
+                y, broadcast_like_tensor, broadcast_axes=tuple(broadcast_y_axes)
+            )
+        return flow.F.where(broadcast_cond, broadcast_x, broadcast_y)
+
+
+@register_tensor_op("where")
+def where_op(condition, x, y):
+    """Return a tensor of elements selected from either :attr:`x` or :attr:`y`, depending on :attr:`condition`.
+    If the element in condition is larger than 0,
+
+    it will take the `x` element, else it will take the `y` element
+
+    .. note::
+
+        The tensors :attr:`condition`, :attr:`x`, :attr:`y` must be broadcastable.
+        It will take the `x` element, else it will take the `y` element.
+
+    Args:
+        condition (IntTensor): When 1 (nonzero), yield x, otherwise yield y
+        x (Tensor or Scalar): value (if :attr:x is a scalar) or values selected at indices
+                            where :attr:`condition` is True
+        y (Tensor or Scalar): value (if :attr:x is a scalar) or values selected at indices
+                            where :attr:`condition` is False
+    Returns:
+        Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`x`, :attr:`y`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import numpy as np
+        >>> import oneflow as flow
+        >>> x = flow.Tensor(
+        ...    np.array([[-0.4620, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
+        ...    dtype=flow.float32,
+        ... )
+        >>> y = flow.Tensor(np.ones(shape=(3, 2)), dtype=flow.float32)
+        >>> condition = flow.Tensor(np.array([[0, 1], [1, 0], [1, 0]]), dtype=flow.int32)
+        >>> out = condition.where(x, y)
+        >>> out #doctest: +ELLIPSIS
+        tensor([[1.    , 0.3139],
+                ...
+                [0.0478, 1.    ]], dtype=oneflow.float32)
+
+    """
+    return Where()(condition, x, y)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/modules/zeropad2d.py b/python/oneflow/nn/modules/zeropad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..81a932297a25617c295af4bb463b6a0127116d46
--- /dev/null
+++ b/python/oneflow/nn/modules/zeropad2d.py
@@ -0,0 +1,119 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Union
+
+import oneflow as flow
+from oneflow.nn.module import Module
+
+
+class ZeroPad2d(Module):
+    """The interface is consistent with PyTorch.
+    The documentation is referenced from:
+    https://pytorch.org/docs/stable/generated/torch.nn.ZeroPad2d.html
+
+    Pads the input tensor boundaries with zero. User can set the amount of padding by setting the parameter `paddings`.
+
+    Args:
+        padding (Union[int, tuple]):  the size of the padding. If is `int`, uses the same padding in all boundaries. If a 4-`tuple`, uses (:math:`\\mathrm{padding_{left}}`, :math:`\\mathrm{padding_{right}}`, :math:`\\mathrm{padding_{top}}`, :math:`\\mathrm{padding_{bottom}}`)
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+            :math:`H_{out} = H_{in} + \\mathrm{padding_{top}} + \\mathrm{padding_{bottom}}`
+
+            :math:`W_{out} = W_{in} + \\mathrm{padding_{left}} + \\mathrm{padding_{right}}`
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> import numpy as np
+        >>> zeropad_layer_int = flow.nn.ZeroPad2d(2)
+        >>> zeropad_layer_tuple = flow.nn.ZeroPad2d((1,2,2,0))
+        >>> input = flow.Tensor(np.arange(18).reshape((1, 2, 3, 3)).astype(np.float32))
+        >>> output_int = zeropad_layer_int(input)
+        >>> output_int.shape
+        flow.Size([1, 2, 7, 7])
+        >>> output_int
+        tensor([[[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  1.,  2.,  0.,  0.],
+                  [ 0.,  0.,  3.,  4.,  5.,  0.,  0.],
+                  [ 0.,  0.,  6.,  7.,  8.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+        <BLANKLINE>
+                 [[ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  9., 10., 11.,  0.,  0.],
+                  [ 0.,  0., 12., 13., 14.,  0.,  0.],
+                  [ 0.,  0., 15., 16., 17.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]]], dtype=oneflow.float32)
+        >>> output_tuple = zeropad_layer_tuple(input)
+        >>> output_tuple
+        tensor([[[[ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  1.,  2.,  0.,  0.],
+                  [ 0.,  3.,  4.,  5.,  0.,  0.],
+                  [ 0.,  6.,  7.,  8.,  0.,  0.]],
+        <BLANKLINE>
+                 [[ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  9., 10., 11.,  0.,  0.],
+                  [ 0., 12., 13., 14.,  0.,  0.],
+                  [ 0., 15., 16., 17.,  0.,  0.]]]], dtype=oneflow.float32)
+    """
+
+    def __init__(self, padding: Union[int, tuple]):
+        super().__init__()
+        if isinstance(padding, tuple):
+            assert len(padding) == 4, ValueError("Length of padding must be 4")
+            boundary = [padding[0], padding[1], padding[2], padding[3]]
+        elif isinstance(padding, int):
+            boundary = [padding, padding, padding, padding]
+        else:
+            raise ValueError("padding must be int  or tuple!")
+        self.padding = boundary
+        self.value = 0.0
+
+    def forward(self, x):
+        (_, _, h, w) = x.shape
+        if x.dtype in [flow.float32, flow.float16, flow.float64]:
+            floating_value = float(self.value)
+            integral_value = int(0)
+        else:
+            floating_value = float(0)
+            integral_value = int(self.value)
+        self._op = (
+            flow.builtin_op("constant_pad2d")
+            .Input("x")
+            .Output("y")
+            .Attr("padding", self.padding)
+            .Attr("floating_value", floating_value)
+            .Attr("integral_value", integral_value)
+            .Build()
+        )
+        res = self._op(x)[0]
+        return res
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
diff --git a/python/oneflow/nn/optimizer/__init__.py b/python/oneflow/nn/optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/nn/optimizer/adam.py b/python/oneflow/nn/optimizer/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..32b6e5ff66f6bc366425d0e13a480e8b2647b776
--- /dev/null
+++ b/python/oneflow/nn/optimizer/adam.py
@@ -0,0 +1,140 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Callable, Dict, Iterator, List, Tuple, Union
+
+import oneflow as flow
+from oneflow.nn.optimizer.optimizer import Optimizer, ParamGroup
+from oneflow.nn.parameter import Parameter
+
+
+class Adam(Optimizer):
+    """Implements Adam algorithm.
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
+    The implementation of the L2 penalty follows changes proposed in
+    `Decoupled Weight Decay Regularization`_.
+
+    This algorithm can adjust the learning rate of each parameter dynamically according to the 1st-moment estimates and the 2nd-moment estimates of gradient.
+
+    the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        scale (float, optional): the scale factor of loss (default: 1.0)
+
+    .. _Adam\\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _Decoupled Weight Decay Regularization:
+        https://arxiv.org/abs/1711.05101
+
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        betas: Tuple[float, float] = (0.9, 0.999),
+        eps: float = 1e-08,
+        weight_decay: float = 0,
+        amsgrad: bool = False,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert eps >= 0.0, f"Invalid epsilon value: {eps}"
+        assert (
+            betas[0] >= 0.0 and betas[0] < 1.0
+        ), f"Invalid beta parameter at index 0: {betas[0]}"
+        assert (
+            betas[1] >= 0.0 and betas[1] < 1.0
+        ), f"Invalid beta parameter at index 1: {betas[1]}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
+        assert scale > 0.0, f"Invalid scale factor: {scale}"
+        assert amsgrad is False, "Not support AMSGrad now!"
+        self._default_options["lr"] = lr
+        self._default_options["eps"] = eps
+        self._default_options["betas"] = betas
+        self._default_options["weight_decay"] = weight_decay
+        self._default_options["amsgrad"] = amsgrad
+        self._default_options["scale"] = scale
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                self._state[param]["exp_avg"] = flow.zeros_like(param)
+                self._state[param]["exp_avg_sq"] = flow.zeros_like(param)
+        self._op = (
+            flow.builtin_op("adam_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("m")
+            .Input("v")
+            .Attr("l1", 0.0)
+            .Attr("weight_decay", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                kwargs = {
+                    "learning_rate_val": param_group["lr"],
+                    "scale": param_group["scale"],
+                    "l2": param_group["weight_decay"],
+                    "beta1": param_group["betas"][0],
+                    "beta2": param_group["betas"][1],
+                    "epsilon": param_group["eps"],
+                }
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    m_tensor = self._state[param]["exp_avg"]
+                    v_tensor = self._state[param]["exp_avg_sq"]
+                    self._op(param, param.grad, m_tensor, v_tensor, **kwargs)
+            self._state["step"] = self._state["step"] + 1
+            return loss
diff --git a/python/oneflow/nn/optimizer/adamw.py b/python/oneflow/nn/optimizer/adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d277f7e3d62f7202a7f79d92b98874fa759c0e2
--- /dev/null
+++ b/python/oneflow/nn/optimizer/adamw.py
@@ -0,0 +1,143 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Callable, Dict, Iterator, List, Tuple, Union
+
+import oneflow as flow
+from oneflow.nn.optimizer.optimizer import Optimizer, ParamGroup
+from oneflow.nn.parameter import Parameter
+
+
+class AdamW(Optimizer):
+    """Implements AdamW algorithm.
+
+    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
+    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
+
+    The optimizer of the Adam-weight-decay algorithm.
+
+    (More details please refer to `Adam-weight-decay <https://www.fast.ai/2018/07/02/adam-weight-decay/>`_).
+
+    So we use Adam-weight-decay algorithm to solve this problem.
+
+    the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*(\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}+\\lambda*param_{old})
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (In the equation is 位, default: 0)
+        scale (float, optional): the scale factor of loss (default: 1.0)
+
+    .. _Adam\\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _Decoupled Weight Decay Regularization:
+        https://arxiv.org/abs/1711.05101
+
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        betas: Tuple[float, float] = (0.9, 0.999),
+        eps: float = 1e-08,
+        weight_decay: float = 0,
+        amsgrad: bool = False,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert eps >= 0.0, f"Invalid epsilon value: {eps}"
+        assert (
+            betas[0] >= 0.0 and betas[0] < 1.0
+        ), f"Invalid beta parameter at index 0: {betas[0]}"
+        assert (
+            betas[1] >= 0.0 and betas[1] < 1.0
+        ), f"Invalid beta parameter at index 1: {betas[1]}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
+        assert scale > 0.0, f"Invalid scale factor: {scale}"
+        assert amsgrad is False, "Not support AMSGrad now!"
+        self._default_options["lr"] = lr
+        self._default_options["eps"] = eps
+        self._default_options["betas"] = betas
+        self._default_options["weight_decay"] = weight_decay
+        self._default_options["amsgrad"] = amsgrad
+        self._default_options["scale"] = scale
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                self._state[param]["exp_avg"] = flow.zeros_like(param)
+                self._state[param]["exp_avg_sq"] = flow.zeros_like(param)
+        self._op = (
+            flow.builtin_op("adam_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("m")
+            .Input("v")
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                kwargs = {
+                    "learning_rate_val": param_group["lr"],
+                    "scale": param_group["scale"],
+                    "weight_decay": param_group["weight_decay"],
+                    "beta1": param_group["betas"][0],
+                    "beta2": param_group["betas"][1],
+                    "epsilon": param_group["eps"],
+                }
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    m_tensor = self._state[param]["exp_avg"]
+                    v_tensor = self._state[param]["exp_avg_sq"]
+                    self._op(param, param.grad, m_tensor, v_tensor, **kwargs)
+            self._state["step"] = self._state["step"] + 1
+            return loss
diff --git a/python/oneflow/nn/optimizer/cosine_annealing_lr.py b/python/oneflow/nn/optimizer/cosine_annealing_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..5095916f8ebbbaafa8776ec1f2792189300e547b
--- /dev/null
+++ b/python/oneflow/nn/optimizer/cosine_annealing_lr.py
@@ -0,0 +1,81 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+from .lr_scheduler import LrScheduler
+
+
+class CosineAnnealingLR(LrScheduler):
+    """This operator creates a Cosine decayed learning rate scheduler.
+
+    Before the steps are specified by user, the learning rate will be updated as:
+
+    .. math::
+
+        & cos\\_decay = 0.5*(1+cos(\\pi*\\frac{current\\_step}{steps}))
+
+        & decay\\_factor = (1-\\alpha)*cos\\_decay+\\alpha
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_factor
+
+    After the steps specified by user, the learning rate will be :
+
+    .. math::
+
+        learning\\_rate = {base\\_learning\\_rate}*{\\alpha}
+
+    It has been proposed in
+    `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this only
+    implements the cosine annealing part of SGDR, and not the restarts.
+
+    Args:
+        optimizer(Optimizer): Wrapped optimizer.
+        steps (int): The decay steps in the scheduler.
+        alpha (float, optional): The learning rate scale factor (:math:`\\alpha`). (default: 0.0)
+        last_step (int, optional): The index of last step. (default: -1)
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. (default: ``False``)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+
+        ...
+        cosine_annealing_lr = flow.optim.lr_scheduler.CosineAnnealingLR(optimizer, steps=100, alpha=0.0)
+        for epoch in range(num_epoch):
+            train(...)
+            cosine_annealing_lr.step()
+
+    .. _SGDR\\: Stochastic Gradient Descent with Warm Restarts:
+        https://arxiv.org/abs/1608.03983
+    """
+
+    def __init__(
+        self, optimizer, steps: int, alpha: float = 0.0, last_step=-1, verbose=False
+    ):
+        assert steps > 0, f"steps must greater than zero, but got {steps}"
+        self.steps = steps
+        self.alpha = alpha
+        super().__init__(optimizer, last_step, verbose)
+
+    def get_lr(self):
+        if self.last_step < self.steps:
+            cos_decay = 0.5 * (1 + math.cos(math.pi * self.last_step / self.steps))
+            decay_factor = (1 - self.alpha) * cos_decay + self.alpha
+            return [base_lr * decay_factor for base_lr in self.base_lrs]
+        else:
+            return [base_lr * self.alpha for base_lr in self.base_lrs]
diff --git a/python/oneflow/nn/optimizer/lambda_lr.py b/python/oneflow/nn/optimizer/lambda_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab463d475ce69aa7a737a320215656c92796cca8
--- /dev/null
+++ b/python/oneflow/nn/optimizer/lambda_lr.py
@@ -0,0 +1,100 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import types
+
+from .lr_scheduler import LrScheduler
+
+
+class LambdaLR(LrScheduler):
+    """
+    Sets the learning rate of each parameter group to the initial lr times a given function.
+    When last_step=-1, sets initial lr as lr.
+
+    .. math::
+
+        learning\\_rate = base\\_learning\\_rate*lambda(last\\_step)
+
+    Args:
+        optimizer(Optimizer): Wrapped optimizer.
+        lr_lambda(function or list): A function which computes a multiplicative factor given an integer
+            parameter epoch, or a list of such functions, one for each group in optimizer.param_groups.
+        last_step (int, optional): The index of last step. (default: -1)
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. (default: ``False``)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+
+        ...
+        lambda1 = lambda step: step // 30
+        lambda2 = lambda step: 0.95 * step
+        lambda_lr = flow.optim.lr_scheduler.LambdaLR(optimizer, [lambda1, lambda2])
+        for epoch in range(num_epoch):
+            train(...)
+            lambda_lr.step()
+
+    """
+
+    def __init__(self, optimizer, lr_lambda, last_step=-1, verbose=False):
+        if not isinstance(lr_lambda, (list, tuple)):
+            self.lr_lambdas = [lr_lambda] * len(optimizer.param_groups)
+        else:
+            assert len(lr_lambda) == len(
+                optimizer.param_groups
+            ), f"Expected {len(optimizer.param_groups)} lr_lambdas, but got {len(lr_lambda)}"
+            self.lr_lambdas = list(lr_lambda)
+        super().__init__(optimizer, last_step, verbose)
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the optimizer.
+        The learning rate lambda functions will only be saved if they are callable objects
+        and not if they are functions or lambdas.
+        """
+        state_dict = {
+            key: value
+            for (key, value) in self.__dict__.items()
+            if key not in ("optimizer", "lr_lambdas")
+        }
+        state_dict["lr_lambdas"] = [None] * len(self.lr_lambdas)
+        for (idx, fn) in enumerate(self.lr_lambdas):
+            if not isinstance(fn, types.FunctionType):
+                state_dict["lr_lambdas"][idx] = fn.__dict__.copy()
+        return state_dict
+
+    def load_state_dict(self, state_dict):
+        """Loads the schedulers state.
+
+        Arguments:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        lr_lambdas = state_dict.pop("lr_lambdas")
+        self.__dict__.update(state_dict)
+        state_dict["lr_lambdas"] = lr_lambdas
+        for (idx, fn) in enumerate(lr_lambdas):
+            if fn is not None:
+                self.lr_lambdas[idx].__dict__.update(fn)
+
+    def get_lr(self):
+        return [
+            base_lr * lmbda(self.last_step)
+            for (lmbda, base_lr) in zip(self.lr_lambdas, self.base_lrs)
+        ]
diff --git a/python/oneflow/nn/optimizer/lr_scheduler.py b/python/oneflow/nn/optimizer/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfaae3047ea1f39975a4e337e0aea8500368561c
--- /dev/null
+++ b/python/oneflow/nn/optimizer/lr_scheduler.py
@@ -0,0 +1,78 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .optimizer import Optimizer
+
+
+class LrScheduler(object):
+    def __init__(self, optimizer, last_step=-1, verbose=False):
+        if not isinstance(optimizer, Optimizer):
+            raise TypeError(f"{type(optimizer).__name__} is not an Optimizer object")
+        self._optimizer = optimizer
+        if last_step == -1:
+            for group in self._optimizer.param_groups:
+                group["initial_lr"] = group["lr"]
+        else:
+            for (i, group) in enumerate(self._optimizer.param_groups):
+                assert (
+                    "initial_lr" in group
+                ), f"param 'initial_lr' is not specified in param_groups[{i}] when resuming an optimizer"
+        self.base_lrs = [group["initial_lr"] for group in self._optimizer.param_groups]
+        self.last_lr = list()
+        self.last_step = last_step
+        self.verbose = verbose
+        self.step()
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the optimizer.
+        """
+        return {
+            key: value for (key, value) in self.__dict__.items() if key != "_optimizer"
+        }
+
+    def load_state_dict(self, state_dict):
+        """Loads the schedulers state.
+
+        Arguments:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        self.__dict__.update(state_dict)
+
+    def get_lr(self):
+        """Compute learning rate using chainable form of the scheduler
+        """
+        raise NotImplementedError
+
+    def get_last_lr(self):
+        """ Return last computed learning rate by current scheduler.
+        """
+        return self.last_lr
+
+    def print_lr(self, group_idx, lr):
+        """Display the current learning rate.
+        """
+        print(f"Adjusting learning rate of param_groups[{group_idx}] to {lr}")
+
+    def step(self):
+        self.last_step += 1
+        self.last_lr = self.get_lr()
+        for (i, group) in enumerate(self._optimizer.param_groups):
+            group["lr"] = self.last_lr[i]
+            if self.verbose:
+                self.print_lr(i, self.last_lr[i])
diff --git a/python/oneflow/nn/optimizer/optimizer.py b/python/oneflow/nn/optimizer/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebb85bd034cd2bd7b4fa571958bbf81ddacd1e3e
--- /dev/null
+++ b/python/oneflow/nn/optimizer/optimizer.py
@@ -0,0 +1,111 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import warnings
+from typing import Any, Callable, Dict, Iterator, Union
+
+from oneflow.framework.tensor import Tensor
+from oneflow.nn.parameter import Parameter
+
+
+class ParamGroup(object):
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], Dict[str, Any]],
+        default_options: Dict,
+    ):
+        if isinstance(parameters, collections.abc.Iterator):
+            self._parameters = list(parameters)
+            self._options = default_options
+        else:
+            assert "params" in parameters
+            self._parameters = list(parameters["params"])
+            self._options = default_options
+            for key in self._options:
+                if key in parameters:
+                    self._options[key] = parameters[key]
+
+    def __getitem__(self, key):
+        return self._options[key]
+
+    def __setitem__(self, key, value):
+        self._options[key] = value
+
+    @property
+    def options(self):
+        return self._options
+
+    @property
+    def parameters(self):
+        return self._parameters
+
+
+class Optimizer(object):
+    def __init__(self):
+        self.param_groups = list()
+        self._default_options = dict()
+        self._state = dict()
+        self._state["step"] = 0
+        self._op = None
+
+    def add_param_group(self, param_group) -> None:
+        raise NotImplementedError()
+
+    def load_state_dict(self, state_dict) -> None:
+        raise NotImplementedError()
+
+    def state_dict(self):
+        raise NotImplementedError()
+
+    def step(self, closure: Union[Callable, None] = None) -> Union[Tensor, None]:
+        raise NotImplementedError()
+
+    def zero_grad(self, set_to_none: bool = False):
+        """Sets the gradients of all optimized torch.Tensor s to zero.
+
+        Args:
+            set_to_none (bool): instead of setting to zero, set the grads to None.
+                This will in general have lower memory footprint, and can modestly
+                improve performance. However, it changes certain behaviors.
+        For example:
+            1. When the user tries to access a gradient and perform manual ops on
+            it, a None attribute or a Tensor full of 0s will behave differently.
+
+            2. If the user requests zero_grad(set_to_none=True) followed by a
+            backward pass, grads are guaranteed to be None for params that did not
+            receive a gradient.
+
+            3. Optimizers have a different behavior if the gradient is 0 or None
+            (in one case it does the step with a gradient of 0 and in the other
+            it skips the step altogether).
+
+        Returns:
+            None
+
+        """
+        all_grad_is_none = True
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                if param.grad is not None:
+                    all_grad_is_none = False
+                    if set_to_none:
+                        param.grad = None
+                    else:
+                        param.grad.zeros_()
+        if all_grad_is_none:
+            warnings.warn(
+                "\nParameters in optimizer do not have gradient.\nPlease check `loss.backward()` is called or not,\nor try to declare optimizer after calling `module.to()`"
+            )
diff --git a/python/oneflow/nn/optimizer/rmsprop.py b/python/oneflow/nn/optimizer/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..21659032fa24ee84571c208f2cd43d5dc4801e8d
--- /dev/null
+++ b/python/oneflow/nn/optimizer/rmsprop.py
@@ -0,0 +1,173 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Callable, Dict, Iterator, List, Union
+
+import oneflow as flow
+from oneflow.nn.optimizer.optimizer import Optimizer, ParamGroup
+from oneflow.nn.parameter import Parameter
+
+
+class RMSprop(Optimizer):
+    """Implements RMSprop algorithm.
+
+    oot Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    .. math::
+
+        r(w, t) = \\alpha r(w, t-1) + (1 - \\alpha)(\\nabla Q_{i}(w))^2
+
+        W = w - \\frac{\\eta} {\\\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates moving average of the squared gradient for
+    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
+    In some cases, adding a momentum term :math: `\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    .. math::
+
+        r(w, t) = \\alpha r(w, t-1) + (1 - \\alpha)(\\nabla Q_{i}(w))^2
+
+        v(w, t) = \\beta v(w, t-1) + \\frac{\\eta} {\\\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w = w - v(w, t)
+
+    if centered is True:
+
+    .. math::
+
+        r(w, t) = \\alpha r(w, t-1) + (1 - \\alpha)(\\nabla Q_{i}(w))^2
+
+        g(w, t) = \\alpha g(w, t-1) + (1 - \\alpha)\\nabla Q_{i}(w)
+
+        v(w, t) = \\beta v(w, t-1) + \\frac{\\eta} {\\\\sqrt{r(w,t) - (g(w, t))^2 +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w = w - v(w, t)
+
+    where, :math:`\\alpha` is a hyperparameter and typical values are 0.99, 0.95
+    and so on. :math:`\\beta` is the momentum term. :math:`\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in range
+    from 1e-4 to 1e-8.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-2)
+        momentum (float, optional): momentum factor (default: 0, oneflow not support momenmtum > 0 now!)
+        alpha (float, optional): smoothing constant (default: 0.99)
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        centered (bool, optional) : if ``True``, compute the centered RMSProp,
+            the gradient is normalized by an estimation of its variance
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        alpha: float = 0.99,
+        eps: float = 1e-08,
+        weight_decay: float = 0,
+        momentum: float = 0.0,
+        centered: bool = False,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert alpha >= 0.0, f"Invalid alpha value: {alpha}"
+        assert eps >= 0.0, f"Invalid epsilon value: {eps}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
+        assert scale > 0.0, f"Invalid scale factor: {scale}"
+        assert momentum == 0.0, "Not support momentum greater than zeros now!"
+        self._default_options["lr"] = lr
+        self._default_options["alpha"] = alpha
+        self._default_options["eps"] = eps
+        self._default_options["weight_decay"] = weight_decay
+        self._default_options["centered"] = centered
+        self._default_options["scale"] = scale
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                self._state[param]["square_avg"] = flow.zeros_like(param)
+                if param_group["centered"]:
+                    self._state[param]["grad_avg"] = flow.zeros_like(param)
+        self._centered_rmsprop = (
+            flow.builtin_op("rmsprop_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("mean_square")
+            .Input("mean_gradient")
+            .Attr("centered", True)
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Build()
+        )
+        self._rmsprop = (
+            flow.builtin_op("rmsprop_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("mean_square")
+            .Attr("centered", False)
+            .Attr("l1", 0.0)
+            .Attr("l2", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                kwargs = {
+                    "learning_rate_val": param_group["lr"],
+                    "scale": param_group["scale"],
+                    "epsilon": param_group["eps"],
+                    "decay_rate": param_group["alpha"],
+                    "weight_decay": param_group["weight_decay"],
+                }
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    ms_tensor = self._state[param]["square_avg"]
+                    if param_group["centered"]:
+                        mg_tensor = self._state[param]["grad_avg"]
+                        self._centered_rmsprop(
+                            param, param.grad, ms_tensor, mg_tensor, **kwargs
+                        )
+                    else:
+                        self._rmsprop(param, param.grad, ms_tensor, **kwargs)
+            self._state["step"] = self._state["step"] + 1
+            return loss
diff --git a/python/oneflow/nn/optimizer/sgd.py b/python/oneflow/nn/optimizer/sgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6fcd86774ed11f61111ab20158354ef2c0a2fa5
--- /dev/null
+++ b/python/oneflow/nn/optimizer/sgd.py
@@ -0,0 +1,154 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+import math
+from typing import Callable, Dict, Iterator, List, Union
+
+import oneflow as flow
+from oneflow.nn.parameter import Parameter
+
+from .optimizer import Optimizer, ParamGroup
+
+
+class SGD(Optimizer):
+    """Implements SGD algorithm.
+
+    This algorithm takes a random sample鈥檚 gradient as an approximate estimate of
+    the overall gradient in small batch gradient descent.
+
+    When the momentum = 0, the equation of parameters updating is:
+
+        .. math::
+
+            param_{new} = param_{old} - learning\\_rate * grad
+
+    With momentum, the equation of parameters updating is:
+
+        .. math::
+
+            & V_t = \\beta * V_{t-1} - learning\\_rate * (g_t * scale + param_{old} * weight\\_decay)
+
+            & param_{new} = param_{old} + V_t
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        momentum (float, optional): Momentum factor (default: 0.0)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0.0)
+        scale (float, optional): the scale factor of loss (default: 1.0)
+
+    """
+
+    def __init__(
+        self,
+        parameters: Union[Iterator[Parameter], List[Dict]],
+        lr: float = 0.001,
+        momentum: float = 0.0,
+        weight_decay: float = 0.0,
+        scale: float = 1.0,
+    ):
+        super().__init__()
+        assert lr >= 0.0, f"Invalid learning rate: {lr}"
+        assert momentum >= 0.0, f"Invalid momentum: {momentum}"
+        assert scale >= 0.0, f"Invalid scale factor: {scale}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay: {weight_decay}"
+        self._default_options["lr"] = lr
+        self._default_options["scale"] = scale
+        self._default_options["momentum"] = momentum
+        self._default_options["weight_decay"] = weight_decay
+        if isinstance(parameters, collections.abc.Iterator):
+            self.param_groups.append(ParamGroup(parameters, self._default_options))
+        else:
+            for param in parameters:
+                self.param_groups.append(ParamGroup(param, self._default_options))
+        for param_group in self.param_groups:
+            for param in param_group.parameters:
+                assert param.is_leaf, "parameters must be leaf tensor"
+                self._state[param] = dict()
+                if param_group["momentum"] != 0.0:
+                    self._state[param]["momentum_buf"] = flow.zeros_like(param)
+        self._momentum_sgd = (
+            flow.builtin_op("momentum_update")
+            .Input("model")
+            .Input("model_diff")
+            .Input("momentum")
+            .Attr("l1", 0.0)
+            .Attr("weight_decay", 0.0)
+            .Build()
+        )
+        self._sgd = (
+            flow.builtin_op("sgd_update")
+            .Input("model")
+            .Input("model_diff")
+            .Attr("weight_decay", 0.0)
+            .Attr("l1", 0.0)
+            .Build()
+        )
+
+    def step(self, closure: Callable = None):
+        with flow.no_grad():
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for param_group in self.param_groups:
+                lr = param_group["lr"]
+                scale = param_group["scale"]
+                l2 = param_group["weight_decay"]
+                for param in param_group.parameters:
+                    if param.grad is None:
+                        continue
+                    if param_group["momentum"] == 0.0:
+                        self._sgd(
+                            param, param.grad, learning_rate_val=lr, l2=l2, scale=scale
+                        )
+                    else:
+                        momentum_buf = self._state[param]["momentum_buf"]
+                        beta = param_group["momentum"]
+                        self._momentum_sgd(
+                            param,
+                            param.grad,
+                            momentum_buf,
+                            learning_rate_val=lr,
+                            l2=l2,
+                            scale=scale,
+                            beta=beta,
+                        )
+            self._state["step"] = self._state["step"] + 1
+            return loss
+
+    def add_to_graph_train_config(self, train_conf, var2var_op_name_dict):
+        for param_group in self.param_groups:
+            optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+            lr = param_group["lr"]
+            beta = param_group["momentum"]
+            scale = param_group["scale"]
+            base_scale = train_conf.loss_scale_factor()
+            assert math.isclose(base_scale, 1, rel_tol=0.0001) or math.isclose(
+                scale, base_scale, rel_tol=0.0001
+            ), "nn.Graph only support one scale factor at the moment, base_scale {} vs scale {}".format(
+                base_scale, scale
+            )
+            train_conf.set_loss_scale_factor(scale)
+            optimizer_conf.set_base_learning_rate(lr)
+            if beta == 0:
+                optimizer_conf.mutable_naive_conf()
+            else:
+                optimizer_conf.mutable_momentum_conf().set_beta(beta)
+            for param in param_group.parameters:
+                if not param.requires_grad:
+                    continue
+                optimizer_conf.add_variable_op_names(var2var_op_name_dict[param])
diff --git a/python/oneflow/nn/optimizer/step_lr.py b/python/oneflow/nn/optimizer/step_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..e978b789ad2a1ec1112575043b80906e6c70128f
--- /dev/null
+++ b/python/oneflow/nn/optimizer/step_lr.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .lr_scheduler import LrScheduler
+
+
+class StepLR(LrScheduler):
+    """
+    Decays the learning rate of each parameter group by gamma every step_size steps.
+    Notice that such decay can happen simultaneously with other changes to the learning
+    rate fromoutside this scheduler. When last_step=-1, sets initial lr as lr.
+
+    Args:
+        optimizer(Optimizer): Wrapped optimizer.
+        step_size (int): Period of learning rate decay.
+        gamma (float, optional): Multiplicative factor of learning rate decay. (default: 0.1)
+        last_step (int, optional): The index of last step. (default: -1)
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. (default: ``False``)
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+
+        ...
+        step_lr = flow.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
+        for epoch in range(num_epoch):
+            train(...)
+            step_lr.step()
+
+    """
+
+    def __init__(
+        self, optimizer, step_size: int, gamma: float = 0.1, last_step=-1, verbose=False
+    ):
+        assert step_size > 0, f"step_size must greater than zero, but got {step_size}"
+        assert gamma > 0.0, f"gamma must greater than zero, but got {gamma}"
+        self.step_size = step_size
+        self.gamma = gamma
+        super().__init__(optimizer, last_step, verbose)
+
+    def get_lr(self):
+        if self.last_step == 0 or self.last_step % self.step_size != 0:
+            return [group["lr"] for group in self._optimizer.param_groups]
+        else:
+            return [group["lr"] * self.gamma for group in self._optimizer.param_groups]
diff --git a/python/oneflow/nn/parameter.py b/python/oneflow/nn/parameter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef22bcc89b8aa4e7cf124d3781a5c2a95cd8c7d6
--- /dev/null
+++ b/python/oneflow/nn/parameter.py
@@ -0,0 +1,28 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+
+
+class Parameter(Tensor):
+    def __init__(self, data, requires_grad=True):
+        if not isinstance(data, Tensor):
+            data = Tensor(data)
+        self._data = data
+        self._data.requires_grad = requires_grad
+
+    def __getattr__(self, name):
+        return getattr(self._data, name)
diff --git a/python/oneflow/nn/utils.py b/python/oneflow/nn/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..493189ef1e7ef297fee5f08d1326bf513af2b84c
--- /dev/null
+++ b/python/oneflow/nn/utils.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def add_indent(in_s, num_spaces):
+    s = in_s.split("\n")
+    if len(s) == 1:
+        return in_s
+    first = s.pop(0)
+    s = [num_spaces * " " + line for line in s]
+    s = "\n".join(s)
+    s = first + "\n" + s
+    return s
diff --git a/python/oneflow/ops/__init__.py b/python/oneflow/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..058e21559bb24f8a00742d3e5012204cfa1cb7b7
--- /dev/null
+++ b/python/oneflow/ops/__init__.py
@@ -0,0 +1,120 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import re
+
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.placement as placement_cfg
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.blob_register as blob_register_util
+import oneflow.eager.boxing_util as boxing_util
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.compile_context as compile_context
+import oneflow.framework.hob as hob
+import oneflow.framework.id_util as id_util
+import oneflow.framework.input_blob_def as input_blob_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def InputOpByArgBlobDef(blob_def):
+    assert isinstance(blob_def, input_blob_util.ArgBlobDef)
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = blob_def.op_name
+    op_conf.input_conf.out = blob_def.blob_name
+    op_conf.input_conf.blob_conf.CopyFrom(blob_def.ToInterfaceBlobConf())
+    blob_def.AddAndInferOp(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = blob_def.op_name
+    lbi.blob_name = blob_def.blob_name
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def ReturnRemoteBlob(remote_blob, allow_cpu_return_op=True):
+    return enable_if.unique([LazyReturnRemoteBlob, EagerReturnRemoteBlob])(
+        remote_blob, allow_cpu_return_op
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def LazyReturnRemoteBlob(remote_blob, allow_cpu_return_op=True):
+    assert isinstance(
+        remote_blob,
+        (
+            oneflow._oneflow_internal.LazyMirroredBlob,
+            oneflow._oneflow_internal.LazyConsistentBlob,
+        ),
+    )
+    (op_conf, lbi, scope) = _GetReturnOpConfAndOutLbiAndScope(
+        remote_blob, allow_cpu_return_op
+    )
+    compile_context.CurJobAddOp(op_conf, scope)
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def EagerReturnRemoteBlob(remote_blob, allow_cpu_return_op=True):
+    if not hob.is_trainable(None):
+        return remote_blob
+    (op_conf, lbi, scope) = _GetReturnOpConfAndOutLbiAndScope(
+        remote_blob, allow_cpu_return_op
+    )
+    if remote_blob.blob_object.op_arg_parallel_attr.is_mirrored():
+        add_and_infer = compile_context.CurJobAddMirroredOp
+    else:
+        add_and_infer = compile_context.CurJobAddConsistentOp
+    op_attribute = add_and_infer(op_conf, scope)
+
+    def BuildInstruction(builder):
+        get_blob_scope = blob_register_util.BnInOp2BlobObjectScope
+        with get_blob_scope(blob_register, op_attribute) as bn_in_op2blob_object:
+            cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+                str(op_attribute)
+            )
+            builder.StatelessCall(
+                cfg_op_attribute,
+                remote_blob.blob_object.parallel_desc_symbol.parallel_conf,
+                bn_in_op2blob_object,
+                boxing_util.BoxingTo,
+            )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def _GetReturnOpConfAndOutLbiAndScope(remote_blob, allow_cpu_return_op=True):
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = id_util.UniqueStr("Return_")
+    setattr(op_conf.return_conf, "in", remote_blob.unique_name)
+    op_conf.return_conf.out = "out"
+    if allow_cpu_return_op:
+        op_conf.device_tag = "cpu"
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = "out"
+    parallel_conf = placement_cfg.ParallelConf()
+    parallel_conf.CopyFrom(remote_blob.parallel_conf)
+
+    def BuildScope(old_scope, builder):
+        return builder.BuildScopeWithNewParallelConf(old_scope, parallel_conf)
+
+    sess = session_ctx.GetDefaultSession()
+    scope = scope_util.MakeScope(BuildScope)
+    return (op_conf, lbi, scope)
diff --git a/python/oneflow/ops/array_ops.py b/python/oneflow/ops/array_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..f32c64bf5ccea7f999601fce6cff5e6e8496047a
--- /dev/null
+++ b/python/oneflow/ops/array_ops.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+
+
+def check_slice_tup_list(slice_tup_list, shape):
+    ndim = len(shape)
+    if not isinstance(slice_tup_list, (list, tuple)) or len(slice_tup_list) > ndim:
+        raise ValueError(
+            "slice_tup_list must be a list or tuple with length less than or equal to number of dimensions of input tensor"
+        )
+    if len(slice_tup_list) < ndim:
+        slice_tup_list += type(slice_tup_list)(
+            [(None, None, None)] * (ndim - len(slice_tup_list))
+        )
+    start_list = []
+    stop_list = []
+    step_list = []
+    for (slice_tup, dim_size) in zip(slice_tup_list, shape):
+        if not isinstance(slice_tup, (tuple, list)) or len(slice_tup) != 3:
+            raise ValueError(
+                "element of slice_tup_list must be a list or tuple with form (start, stop, step)"
+            )
+        if not all((isinstance(idx, int) or idx is None for idx in slice_tup)):
+            raise ValueError("element of slice tuple must int or None")
+        (start, stop, step) = slice_tup
+        if step is None:
+            step = 1
+        if step == 0:
+            raise ValueError("slice step can't be 0")
+        if start is None:
+            start = 0 if step > 0 else np.iinfo(np.int64).max
+        elif start < -dim_size or start >= dim_size:
+            raise ValueError("slice start must be in range [-size, size)")
+        if stop is None:
+            stop = np.iinfo(np.int64).max if step > 0 else np.iinfo(np.int64).min
+        elif stop < -dim_size - 1 or stop > dim_size:
+            raise ValueError("slice start must be in range [-size-1, size]")
+        start_list.append(start)
+        stop_list.append(stop)
+        step_list.append(step)
+    return (start_list, stop_list, step_list)
+
+
+def GetSliceAttrs(slice_tup_list, input_shape):
+    ndim = len(input_shape)
+    if not (isinstance(slice_tup_list, (list, tuple)) and len(slice_tup_list) <= ndim):
+        raise ValueError(
+            "slice_tup_list must be a list or tuple with length less than or equal to number of dimensions of input tensor"
+        )
+    if len(slice_tup_list) < ndim:
+        slice_tup_list += type(slice_tup_list)(
+            [(None, None, None)] * (ndim - len(slice_tup_list))
+        )
+    start_list = []
+    stop_list = []
+    step_list = []
+    for (slice_tup, dim_size) in zip(slice_tup_list, input_shape):
+        if not (isinstance(slice_tup, (tuple, list)) and len(slice_tup) == 3):
+            raise ValueError(
+                "element of slice_tup_list must be a list or tuple with form (start, stop, step)"
+            )
+        if not all((isinstance(idx, int) or idx is None for idx in slice_tup)):
+            raise ValueError("element of slice tuple must int or None")
+        (start, stop, step) = slice_tup
+        if step is None:
+            step = 1
+        if step <= 0:
+            raise ValueError("slice_assign/logical_slice step must be greater than 0")
+        if start is None:
+            start = 0
+        elif start < -dim_size or start >= dim_size:
+            raise ValueError(
+                "slice_assign/logical_slice start must be in range [-size, size)"
+            )
+        elif start < 0:
+            start += dim_size
+        if stop is None:
+            stop = dim_size
+        elif stop < -dim_size or stop > dim_size:
+            raise ValueError(
+                "slice_assign/logical_slice start must be in range [-size, size]"
+            )
+        elif stop < 0:
+            stop += dim_size
+        start_list.append(start)
+        stop_list.append(stop)
+        step_list.append(step)
+    return (start_list, stop_list, step_list)
diff --git a/python/oneflow/ops/assign_op.py b/python/oneflow/ops/assign_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9173dfc8212987a747ad8617df6fe822c8a8263c
--- /dev/null
+++ b/python/oneflow/ops/assign_op.py
@@ -0,0 +1,95 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import oneflow
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.boxing_util as boxing_util
+import oneflow.framework.hob as hob
+import oneflow.framework.id_util as id_util
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.placement_context as placement_ctx
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.support.enable_if as enable_if
+
+
+def assign(ref, value, dtype=None, name=None):
+    if name is None:
+        name = id_util.UniqueStr("Assign_")
+    op = (
+        oneflow.consistent_user_op_builder(name)
+        .Op("assign")
+        .Input("ref", [ref])
+        .Input("value", [value])
+        .Build()
+    )
+    op.InferAndTryRun()
+
+
+def api_system_assign(ref, value, validate_shape=None, use_locking=None, name=None):
+    api = enable_if.unique([lazy_system_assign, eager_system_assign])
+    return api(
+        ref, value, validate_shape=validate_shape, use_locking=use_locking, name=name
+    )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_system_assign(ref, value, validate_shape=None, use_locking=None, name=None):
+    op_conf = _SystemAssignOpConf(ref, value, name=name)
+    (
+        device_tag,
+        machine_device_ids,
+        hierarchy,
+    ) = oneflow._oneflow_internal.GetDeviceTagAndMachineDeviceIdsAndHierarchy(
+        ref.parallel_conf
+    )
+    if hierarchy is not None:
+        hierarchy = tuple(hierarchy.dim())
+    with oneflow.scope.placement(device_tag, machine_device_ids, hierarchy):
+        interpret_util.Forward(op_conf)
+    return ref
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_system_assign(ref, value, validate_shape=None, use_locking=None, name=None):
+    op_conf = _SystemAssignOpConf(ref, value, name=name)
+    oneflow._oneflow_internal.deprecated.LogicalRun(
+        lambda builder: boxing_util.BuildAssignInstruction(
+            builder, ref.blob_object, value.blob_object, op_conf
+        )
+    )
+    return ref
+
+
+def api_one_to_one_assign(ref, value):
+    assert hob.eager_execution_enabled(None)
+    oneflow._oneflow_internal.deprecated.LogicalRun(
+        lambda builder: builder.Build121AssignInstruction(
+            ref.blob_object, value.blob_object
+        )
+    )
+    return ref
+
+
+def _SystemAssignOpConf(ref, value, name=None):
+    if name is None:
+        name = id_util.UniqueStr("Assign_")
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    op_conf.assign_conf.ref = ref.unique_name
+    op_conf.assign_conf.value = value.unique_name
+    return op_conf
diff --git a/python/oneflow/ops/builtin_ops.py b/python/oneflow/ops/builtin_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b73e97f615c90f582f816dbaacfdac89084d18
--- /dev/null
+++ b/python/oneflow/ops/builtin_ops.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+from oneflow.framework.attr_util import convert_to_user_attr_value
+
+
+class BuiltinOp(object):
+    def __init__(self, op_type_name, op_name=None):
+        if op_name is None:
+            op_name = id_util.UniqueStr(op_type_name)
+        self._builder = oneflow._oneflow_internal.one.OpBuilder(op_type_name, op_name)
+        self._op = None
+        self._op_type_name = op_type_name
+
+    @property
+    def op(self):
+        """access the builtin op
+
+        Returns:
+            the builtin op
+        """
+        if self._op is None:
+            self._op = self._builder.build()
+        return self._op
+
+    def Input(self, input_name, num=1):
+        """Set input blob of op
+
+        Args:
+            input_name (str): input name of blob
+            num (int, optional) : Defaults to 1.
+
+        Returns:
+            self
+        """
+        assert isinstance(num, int) and num >= 1
+        self._builder.input(input_name, num)
+        return self
+
+    def Output(self, output_name, num=1):
+        """Set output blob of op
+
+        Args:
+            output_name (str): name of output blob
+            num (int, optional):  Defaults to 1.
+
+        Returns:
+            self
+        """
+        assert isinstance(num, int) and num >= 1
+        self._builder.output(output_name, num)
+        return self
+
+    def Attr(self, attr_name, attr_value, attr_type_name=None):
+        """Set value of op's attribute.
+
+        Args:
+            attr_name (str): attribute name of op
+            attr_value (Any): attribute value of op
+
+        Raises:
+            ValueError: raised when value is not idential to op's attribute type.
+
+        Returns:
+            [type]: [description]
+        """
+        if attr_type_name is not None:
+            print(
+                'WARNING: Argument \'attr_type_name\' of UserOpConfBuilder.Attr has been deprecated. Please remove it.\n\n            For instance:\n                -     .Attr("out_num", out_num, "AttrTypeInt64")\n                +     .Attr("out_num", out_num)\n                        '
+            )
+            print(traceback.format_stack()[-2])
+        assert self._op_type_name is not None
+        self._builder.attr(
+            attr_name,
+            convert_to_user_attr_value(self._op_type_name, attr_name, attr_value),
+        )
+        return self
+
+    def Build(self):
+        """Explicitly complete the construction of the builtin op
+
+        Returns:
+            the completed builtin op
+        """
+        if self._op is None:
+            self._op = self._builder.build()
+        return self._op
diff --git a/python/oneflow/ops/categorical_ordinal_encode_op.py b/python/oneflow/ops/categorical_ordinal_encode_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f37ec9e3f0066892d9e7d905f4dc4996d639daa
--- /dev/null
+++ b/python/oneflow/ops/categorical_ordinal_encode_op.py
@@ -0,0 +1,164 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def categorical_ordinal_encode(
+    table: oneflow._oneflow_internal.BlobDesc,
+    size: oneflow._oneflow_internal.BlobDesc,
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    hash_precomputed: bool = True,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator maintains a hash table to encode the categorical ordinal Blob. It converts a discrete input value into a continuous integer ID.
+
+    Args:
+        table (oneflow._oneflow_internal.BlobDesc): The hash table, you can assign it as a variable.
+        size (oneflow._oneflow_internal.BlobDesc): The size of hash table.
+        input_tensor (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        hash_precomputed (bool, optional): We currently only support the 'True' mode. The internal hash value will no longer be computed. Defaults to True.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+        @flow.global_function()
+        def categorical_ordinal_encode_Job(x: tp.Numpy.Placeholder((3, 3), dtype=flow.int32)
+        ) -> tp.Numpy:
+            dtype = x.dtype
+            with flow.scope.namespace("categorical_ordinal_encode"):
+                table = flow.get_variable(
+                    name="Table",
+                    shape=(16,),
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(0, dtype=dtype),
+                    trainable=False,
+                    reuse=False,
+                )
+                size = flow.get_variable(
+                    name="Size",
+                    shape=(1,),
+                    dtype=dtype,
+                    initializer=flow.constant_initializer(0, dtype=dtype),
+                    trainable=False,
+                    reuse=False,
+                )
+                return flow.categorical_ordinal_encode(
+                    table=table, size=size, input_tensor=x, name="Encode",
+                )
+
+        x = np.array([[7, 0, 2],
+                    [1, 7, 2],
+                    [0, 1, 7]]).astype(np.int32)
+
+        out = categorical_ordinal_encode_Job(x)
+
+        # out [[1 0 2]
+        #      [3 1 2]
+        #      [0 3 1]]
+
+    """
+    assert hash_precomputed is True
+    return (
+        flow.user_op_builder(name or id_util.UniqueStr("CategoricalOrdinalEncode_"))
+        .Op("CategoricalOrdinalEncode")
+        .Input("in", [input_tensor])
+        .Input("table", [table])
+        .Input("size", [size])
+        .Output("out")
+        .Attr("hash_precomputed", hash_precomputed)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def categorical_ordinal_encoder(
+    input_tensor: oneflow._oneflow_internal.BlobDesc,
+    capacity: int,
+    hash_precomputed: bool = True,
+    name: str = "CategoricalOrdinalEncoder",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator uses `oneflow.categorical_ordinal_encode` to encapsulate a categorical_ordinal_encoder. More details please refer to `oneflow.categorical_ordinal_encode`
+
+    Args:
+        input_tensor (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        capacity (int): The capacity of hash table.
+        hash_precomputed (bool, optional): We currently only support the 'True' mode. The internal hash value will no longer be computed. Defaults to True.
+        name (str, optional): The name for the operation. Defaults to "CategoricalOrdinalEncoder".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+        @flow.global_function()
+        def categorical_ordinal_encoder_Job(x: tp.Numpy.Placeholder((3, 3), dtype=flow.int32)
+        ) -> tp.Numpy:
+            return flow.layers.categorical_ordinal_encoder(x, 16)
+
+        x = np.array([[7, 0, 2],
+                    [1, 7, 2],
+                    [0, 1, 7]]).astype(np.int32)
+
+        out = categorical_ordinal_encoder_Job(x)
+
+        # out [[1 0 2]
+        #      [3 1 2]
+        #      [0 3 1]]
+
+    """
+    assert hash_precomputed is True
+    dtype = input_tensor.dtype
+    with flow.scope.namespace(name):
+        table = flow.get_variable(
+            name="Table",
+            shape=(capacity * 2,),
+            dtype=dtype,
+            initializer=flow.constant_initializer(0, dtype=dtype),
+            trainable=False,
+            reuse=False,
+        )
+        size = flow.get_variable(
+            name="Size",
+            shape=(1,),
+            dtype=dtype,
+            initializer=flow.constant_initializer(0, dtype=dtype),
+            trainable=False,
+            reuse=False,
+        )
+        return categorical_ordinal_encode(
+            table=table, size=size, input_tensor=input_tensor, name="Encode"
+        )
diff --git a/python/oneflow/ops/constant_op.py b/python/oneflow/ops/constant_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..135bf11ff1212eff0de01553ff56359b3fbd458a
--- /dev/null
+++ b/python/oneflow/ops/constant_op.py
@@ -0,0 +1,215 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def constant(
+    value: Union[int, float],
+    dtype: Optional[flow.dtype] = None,
+    shape: Optional[Sequence[int]] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a constant Blob.
+
+    Args:
+        value (Union[int, float]): The constant value of Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        shape (Optional[Sequence[int]], optional): The shape of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        NotImplementedError: The data type of value should be int or float.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def constant_Job() -> tp.Numpy:
+            constant_blob = flow.constant(value=1.5,
+                                        shape=(1, 3, 3),
+                                        dtype=flow.float)
+            return constant_blob
+
+
+        out = constant_Job()
+
+        # out [[[1.5 1.5 1.5]
+        #       [1.5 1.5 1.5]
+        #       [1.5 1.5 1.5]]]
+
+    """
+    if name is None:
+        name = id_util.UniqueStr("Constant_")
+    assert value is not None
+    assert dtype is not None
+    if not isinstance(value, (int, float)):
+        raise NotImplementedError
+    if isinstance(value, float):
+        is_floating_value = True
+        floating_value = float(value)
+        integer_value = int(0)
+    else:
+        is_floating_value = False
+        floating_value = float(0)
+        integer_value = int(value)
+    if shape is not None:
+        assert isinstance(shape, (list, tuple))
+    else:
+        shape = []
+    return (
+        flow.user_op_builder(name)
+        .Op("constant")
+        .Output("out")
+        .Attr("floating_value", floating_value)
+        .Attr("integer_value", integer_value)
+        .Attr("is_floating_value", is_floating_value)
+        .Attr("dtype", dtype)
+        .Attr("shape", shape)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def constant_scalar(
+    value: Union[int, float],
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a constant scalar Blob.
+
+    Args:
+        value (Union[int, float]): The constant value of Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def constant_scalar_Job() -> tp.Numpy:
+            constant_scalar = flow.constant_scalar(value=2.5,
+                                                dtype=flow.float)
+            return constant_scalar
+
+
+        out = constant_scalar_Job()
+
+        # out [2.5]
+
+    """
+    return flow.constant(value, dtype=dtype, shape=[1])
+
+
+def constant_like(
+    like: oneflow._oneflow_internal.BlobDesc,
+    value: Union[int, float],
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator creates a constant Blob that has the same shape as `like`.
+
+    Args:
+        like (oneflow._oneflow_internal.BlobDesc): A Blob.
+        value (Union[int, float]): The constant value of Blob.
+        dtype (Optional[flow.dtype], optional): The data type of Blob. Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Raises:
+        NotImplementedError: The data type of value should be int or float.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def constant_like_Job() -> tp.Numpy:
+            constant_blob = flow.constant(value=1.5,
+                                        shape=(1, 3, 3),
+                                        dtype=flow.float)
+            constant_like_blob = flow.constant_like(like=constant_blob,
+                                                    value=5.5,
+                                                    dtype=flow.float)
+            return constant_like_blob
+
+
+        out = constant_like_Job()
+
+        # out [[[5.5 5.5 5.5]
+        #       [5.5 5.5 5.5]
+        #       [5.5 5.5 5.5]]]
+
+    """
+    op_conf = op_conf_util.OperatorConf()
+    setattr(
+        op_conf,
+        "name",
+        name if name is not None else id_util.UniqueStr("ConstantLike_"),
+    )
+    setattr(op_conf.constant_like_conf, "like", like.unique_name)
+    if isinstance(value, int):
+        op_conf.constant_like_conf.int_operand = value
+    elif isinstance(value, float):
+        op_conf.constant_like_conf.float_operand = value
+    else:
+        raise NotImplementedError
+    if dtype is not None:
+        setattr(
+            op_conf.constant_like_conf,
+            "data_type",
+            oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(dtype),
+        )
+    setattr(op_conf.constant_like_conf, "out", "out")
+    interpret_util.Forward(op_conf)
+    out_lbi = logical_blob_id_util.LogicalBlobId()
+    setattr(out_lbi, "op_name", op_conf.name)
+    setattr(out_lbi, "blob_name", "out")
+    return remote_blob_util.RemoteBlob(out_lbi)
diff --git a/python/oneflow/ops/count_not_finite.py b/python/oneflow/ops/count_not_finite.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e6212fc2b78f8486b712dc29c8e7d88a00988a9
--- /dev/null
+++ b/python/oneflow/ops/count_not_finite.py
@@ -0,0 +1,58 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def count_not_finite(
+    x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("CountNotFinite_")
+        )
+        .Op("count_not_finite")
+        .Input("x", [x])
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def multi_count_not_finite(
+    x: Optional[Sequence[oneflow._oneflow_internal.BlobDesc]] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("MultiCountNotFinite_")
+        )
+        .Op("multi_count_not_finite")
+        .Input("x", x)
+        .Output("y")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/ops/domain_ops.py b/python/oneflow/ops/domain_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccfc942f905ee9e4fecc8cba62d3bd093e362fb4
--- /dev/null
+++ b/python/oneflow/ops/domain_ops.py
@@ -0,0 +1,42 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import typing
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+
+
+def api_fused_self_attention_query_mul_key_and_value(
+    x: oneflow._oneflow_internal.BlobDesc,
+    head_size: int,
+    alpha: float = 1.0,
+    name: typing.Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    if name is None:
+        name = id_util.UniqueStr("FusedSelfAttentionQueryMulKeyAndValue_")
+    op = (
+        flow.user_op_builder(name)
+        .Op("fused_self_attention_query_mul_key_and_value")
+        .Input("hidden_states", [x])
+        .Attr("head_size", int(head_size))
+        .Attr("alpha", float(alpha))
+        .Output("query_mul_key")
+        .Output("value")
+        .Build()
+    )
+    (qmk, v) = op.InferAndTryRun().RemoteBlobList()
+    return (qmk, v)
diff --git a/python/oneflow/ops/eager_nccl_ops.py b/python/oneflow/ops/eager_nccl_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eff13ed00fc23e3270787efcc23b7c38de8e2c1
--- /dev/null
+++ b/python/oneflow/ops/eager_nccl_ops.py
@@ -0,0 +1,40 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def eager_nccl_all_reduce(
+    x: oneflow._oneflow_internal.BlobDesc,
+    parallel_conf: str,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("EagerNcclAllReduce_")
+        )
+        .Op("eager_nccl_all_reduce")
+        .Input("in", [x])
+        .Output("out")
+        .Attr("parallel_conf", parallel_conf)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
diff --git a/python/oneflow/ops/get_variable.py b/python/oneflow/ops/get_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..6677eaa1dbbae09177de05f3a867f789a682e948
--- /dev/null
+++ b/python/oneflow/ops/get_variable.py
@@ -0,0 +1,385 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Sequence, Union
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.register.logical_blob_id as lbi_util
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.job.regularizer_conf_pb2 as regularizer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.boxing_util as boxing_util
+import oneflow.eager.gradient_util as gradient_util
+import oneflow.eager.op_executor as op_executor
+import oneflow.experimental.namescope as name_scope
+import oneflow.framework.compile_context as compile_context
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.hob as hob
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.framework.runtime_mode as rt_mode
+import oneflow.framework.session_context as session_ctx
+import oneflow.support.enable_if as enable_if
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+def api_get_variable(
+    name: str,
+    shape: Optional[Sequence[int]] = None,
+    dtype: Optional[oneflow.dtype] = oneflow.float32,
+    initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    trainable: Optional[bool] = None,
+    model_name: Optional[str] = None,
+    random_seed: Optional[int] = None,
+    parallel_distribution: Optional[
+        Union[
+            Sequence[oneflow._oneflow_internal.distribute.Distribute],
+            Sequence[str],
+            str,
+        ]
+    ] = None,
+    distribute: Optional[oneflow._oneflow_internal.distribute.Distribute] = None,
+    reuse: bool = True,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Create a variable or retrieve an existing one.
+
+    Args:
+        name: Name of this variable. One variable could be shared by multiple OneFlow functions. `None` by default
+        shape: Shape of the variable. `None` by default
+        dtype: Data type of the variable. `None` by default
+        initializer: A initializer object. For instance, a :func:`~oneflow.ones_initializer`. `None` by default
+        trainable: A `bool` to indicate if this variable is trainable. `True` by default
+        model_name: A `string`. `'weight'` or `'bias'`. `None` by default
+        random_seed: Random seed for random initializers. `None` by default
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def variable_Job() -> None:
+            init = flow.constant_initializer(1.25)
+            variable = flow.get_variable(
+                "variable-weight",
+                shape=(1, 3, 2, 2),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(variable, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        variable_Job()
+
+        # out [[[[1.25 1.25]
+        #        [1.25 1.25]]
+
+        #       [[1.25 1.25]
+        #        [1.25 1.25]]
+
+        #       [[1.25 1.25]
+        #        [1.25 1.25]]]]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        def conv2d(input, filters, kernel_size, strides, padding, name):
+            input_shape = input.shape
+            weight_initializer = flow.truncated_normal(0.1)
+            weight_regularizer = flow.regularizers.l2(0.0005)
+            weight_shape = (filters,
+                            input_shape[1],
+                            kernel_size[0],
+                            kernel_size[1])
+
+            weight = flow.get_variable(
+                name + "-weight",
+                shape=weight_shape,
+                initializer=weight_initializer,
+                regularizer=weight_regularizer,
+            )
+            return flow.nn.conv2d(input, weight, strides, padding, name=name)
+
+
+        @flow.global_function()
+        def conv2d_Job(x: tp.Numpy.Placeholder((1, 64, 32, 32))
+        ) -> tp.Numpy:
+            conv = conv2d(x,
+                        filters=128,
+                        kernel_size=[3, 3],
+                        strides=2,
+                        padding='SAME',
+                        name="ConvLayer")
+            return conv
+
+
+        x = np.random.randn(1, 64, 32, 32).astype(np.float32)
+        out = conv2d_Job(x)
+
+        # out.shape (1, 128, 16, 16)
+
+    """
+    if distribute is not None:
+        assert parallel_distribution is None
+        parallel_distribution = [distribute]
+    if parallel_distribution is None:
+        parallel_distribution = []
+    if isinstance(parallel_distribution, str):
+        parallel_distribution = parallel_distribution.split(",")
+    assert isinstance(parallel_distribution, (list, tuple))
+
+    def distribute_to_str(dist):
+        if dist is None:
+            return ""
+        elif type(dist) is str:
+            return dist
+        elif type(dist) is oneflow._oneflow_internal.distribute.SplitDistribute:
+            return "S({})".format(dist.axis)
+        elif type(dist) is oneflow._oneflow_internal.distribute.BroadcastDistribute:
+            return "B"
+        else:
+            raise ValueError("unsupported distribute")
+
+    parallel_distribution = list(map(distribute_to_str, parallel_distribution))
+    api = enable_if.unique([get_lazy_variable, get_eager_variable])
+    return api(
+        name,
+        shape=shape,
+        dtype=dtype,
+        initializer=initializer,
+        regularizer=regularizer,
+        trainable=trainable,
+        model_name=model_name,
+        random_seed=random_seed,
+        parallel_distribution=parallel_distribution,
+        reuse=reuse,
+    )
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def get_eager_variable(
+    name,
+    shape=None,
+    dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=None,
+    model_name=None,
+    random_seed=None,
+    parallel_distribution=None,
+    reuse=True,
+):
+    assert isinstance(name, str)
+    assert isinstance(
+        shape, (list, tuple)
+    ), "param shape should be a list or tuple of dimension"
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    name = name_scope.GetJobNameScopePrefix(job_name) + name
+    sess = session_ctx.GetDefaultSession()
+    (var_blob, job_var_blob) = sess.TryGetVariableBlobOfJobFromStash(job_name, name)
+    if reuse is False:
+        assert (
+            job_var_blob is None
+        ), "variable '{}' already exists, getting the same variable is not allowed when reuse is False".format(
+            name
+        )
+    if job_var_blob is None:
+        op_conf = GenerateVariableOpConf(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            initializer=initializer,
+            regularizer=regularizer,
+            trainable=trainable,
+            model_name=model_name,
+            random_seed=random_seed,
+            parallel_distribution=parallel_distribution,
+        )
+        op_attribute = compile_context.CurJobAddConsistentOp(op_conf)
+        if var_blob is None:
+            var_blob = CreateEagerVariableBlob(op_attribute)
+            op_executor.EagerInitVariableBlob(sess, op_conf, var_blob)
+        assert isinstance(var_blob, oneflow._oneflow_internal.EagerConsistentBlob)
+        sess.StashVariableBlob4Job(job_name, op_conf.name, var_blob)
+    else:
+        assert isinstance(job_var_blob, oneflow._oneflow_internal.EagerConsistentBlob)
+        assert isinstance(var_blob, oneflow._oneflow_internal.EagerConsistentBlob)
+        assert var_blob.IdenticalTo(job_var_blob)
+    bw_blob_register = gradient_util.GetDefaultBackwardBlobRegister()
+    bw_blob_register.TrySetObject4BlobName(
+        var_blob.logical_blob_name, var_blob.blob_object
+    )
+    return var_blob
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def get_lazy_variable(
+    name,
+    shape=None,
+    dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=None,
+    model_name=None,
+    random_seed=None,
+    parallel_distribution=None,
+    reuse=True,
+):
+    assert isinstance(name, str)
+    assert isinstance(
+        shape, (list, tuple)
+    ), "param shape should be a list or tuple of dimension"
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    name = name_scope.GetJobNameScopePrefix(job_name) + name
+    sess = session_ctx.GetDefaultSession()
+    (var_blob, job_var_blob) = sess.TryGetVariableBlobOfJobFromStash(job_name, name)
+    if reuse is False:
+        assert (
+            job_var_blob is None
+        ), "variable '{}' already exists, getting the same variable is not allowed when param reuse is False".format(
+            name
+        )
+    if job_var_blob is None:
+        op_conf = GenerateVariableOpConf(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            initializer=initializer,
+            regularizer=regularizer,
+            trainable=trainable,
+            model_name=model_name,
+            random_seed=random_seed,
+            parallel_distribution=parallel_distribution,
+        )
+        job_var_blob = _CreateVariableBlob(op_conf)
+        assert isinstance(job_var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+        sess.StashVariableBlob4Job(job_name, op_conf.name, job_var_blob)
+        if var_blob is not None:
+            assert isinstance(var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+            assert var_blob.IdenticalTo(job_var_blob)
+    else:
+        assert isinstance(job_var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+        assert isinstance(var_blob, oneflow._oneflow_internal.LazyConsistentBlob)
+        assert var_blob.IdenticalTo(job_var_blob)
+    return job_var_blob
+
+
+def GenerateVariableOpConf(
+    name,
+    shape,
+    dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=None,
+    model_name=None,
+    random_seed=None,
+    parallel_distribution=None,
+):
+    op_conf = op_conf_util.OperatorConf()
+    op_conf.name = name
+    op_conf.variable_conf.shape.dim.extend(shape)
+    assert dtype is not None
+    op_conf.variable_conf.data_type = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+        dtype
+    )
+    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
+        root_path = None
+    else:
+        root_path = (
+            compile_context.GetCurJobConfigProto().default_initialize_with_snapshot_path()
+        )
+        dir_path = os.path.join(root_path, name)
+        file_path = os.path.join(dir_path, "out")
+    if root_path and os.path.isfile(file_path):
+        op_conf.variable_conf.initialize_with_snapshot.path = dir_path
+        op_conf.variable_conf.initialize_with_snapshot.key = "out"
+    else:
+        if root_path:
+            print("{} not found, will be initialized".format(file_path))
+        if initializer is not None:
+            op_conf.variable_conf.initializer.CopyFrom(initializer)
+    if regularizer is not None:
+        op_conf.variable_conf.regularizer.CopyFrom(regularizer)
+    if trainable is not None:
+        op_conf.variable_conf.trainable = trainable
+    if model_name is not None:
+        op_conf.variable_conf.model_name = model_name
+    if parallel_distribution is None:
+        parallel_distribution = []
+    op_conf.variable_conf.parallel_distribution.extend(parallel_distribution)
+    if random_seed is not None:
+        op_conf.variable_conf.random_seed = random_seed
+    op_conf.variable_conf.out = "out"
+    return op_conf
+
+
+def _CreateVariableBlob(op_conf):
+    compile_context.CurJobAddConsistentOp(op_conf)
+    lbi = logical_blob_id_util.LogicalBlobId()
+    lbi.op_name = op_conf.name
+    lbi.blob_name = op_conf.variable_conf.out
+    return remote_blob_util.RemoteBlob(lbi)
+
+
+def CreateEagerVariableBlob(op_attribute, job_name=""):
+    bn_in_op2blob_object = oneflow._oneflow_internal.deprecated.BnInOp2BlobObject()
+
+    def BuildInstruction(builder):
+        parallel_conf = (
+            oneflow.current_scope().device_parallel_desc_symbol.parallel_conf
+        )
+        cfg_op_attribute = oneflow._oneflow_internal.deprecated.MakeOpAttributeByString(
+            str(op_attribute)
+        )
+        builder.StatelessCall(
+            cfg_op_attribute, parallel_conf, bn_in_op2blob_object, boxing_util.BoxingTo
+        )
+
+    oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+    lbi = lbi_util.LogicalBlobId()
+    lbi.set_op_name(op_attribute.op_conf.name)
+    lbi.set_blob_name(op_attribute.op_conf.variable_conf.out)
+    if not isinstance(lbi, lbi_util.LogicalBlobId):
+        cfg_lbi = lbi_util.LogicalBlobId()
+        cfg_lbi.set_op_name(lbi.op_name)
+        cfg_lbi.set_blob_name(lbi.blob_name)
+        lbi = cfg_lbi
+    return oneflow._oneflow_internal.EagerConsistentBlob(
+        lbi,
+        blob_object=bn_in_op2blob_object["out"],
+        blob_register=blob_register,
+        job_name=job_name,
+    )
diff --git a/python/oneflow/ops/initializer_util.py b/python/oneflow/ops/initializer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..3be523230593ae5c394a88a467d03824412209c3
--- /dev/null
+++ b/python/oneflow/ops/initializer_util.py
@@ -0,0 +1,1217 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+import math
+from typing import Optional, Sequence, Union
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+def empty_initializer(
+    dtype: flow.dtype = flow.float,
+) -> initializer_conf_util.InitializerConf:
+    initializer = initializer_conf_util.InitializerConf()
+    empty_conf = initializer_conf_util.EmptyInitializerConf()
+    initializer.empty_conf.CopyFrom(empty_conf)
+    return initializer
+
+
+def constant_initializer(
+    value: float = 0, dtype: flow.dtype = flow.float
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blob with constant values.
+
+    Args:
+        value (float, optional): A Python scalar. All elements of the initialized variable . Defaults to 0.
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Raises:
+        NotImplementedError:  Do not support such data type.
+
+    Returns:
+        initializer_conf_util.InitializerConf:  An InitializerConf object.
+    
+    For example: 
+
+    Example 1:
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def constant_Job() -> None:
+            init = flow.constant_initializer(2.5)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        constant_Job()
+
+        # out [2.5 2.5 2.5]
+
+    Example 2:
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_constant_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.constant_initializer(0.01)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_constant_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    initializer = initializer_conf_util.InitializerConf()
+    if dtype in [flow.float, flow.double]:
+        setattr(initializer.constant_conf, "value", float(value))
+    elif dtype in [flow.int8, flow.int32, flow.int64]:
+        setattr(initializer.constant_int_conf, "value", int(value))
+    else:
+        raise NotImplementedError("Do not support such data type")
+    return initializer
+
+
+def zeros_initializer(
+    dtype: flow.dtype = flow.float,
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blobs initialized to 0
+
+    Args:
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Returns:
+        initializer_conf_util.InitializerConf: constant_initializer
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def zeros_Job() -> None:
+            init = flow.zeros_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        zeros_Job()
+
+        # out [0. 0. 0.]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_zero_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.zeros_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_zero_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return constant_initializer(0.0, dtype)
+
+
+def ones_initializer(
+    dtype: flow.dtype = flow.float,
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blobs initialized to 1.
+
+    Args:
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Returns:
+        initializer_conf_util.InitializerConf: constant_initializer
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def ones_Job() -> None:
+            init = flow.ones_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        ones_Job()
+
+        # out [1. 1. 1.]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_one_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.ones_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_one_Job(x)
+        
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return constant_initializer(1.0, dtype)
+
+
+def random_uniform_initializer(
+    minval: float = 0, maxval: float = 1, dtype: flow.dtype = flow.float
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blobs with a uniform distribution. 
+
+    Args:
+        minval (float, optional): A python scalar. Lower bound of the range of random values to generate. Defaults to 0.
+        maxval (float, optional): A python scalar. Upper bound of the range of random values to generate. Defaults to 1.
+        dtype (flow.dtype, optional): Default data type. Defaults to flow.float.
+
+    Raises:
+        NotImplementedError: Do not support such data type.
+
+    Returns:
+        initializer_conf_util.InitializerConf:  Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def random_uniform_Job() -> None:
+            init = flow.random_uniform_initializer(minval=0, maxval=0.5)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        random_uniform_Job()
+
+        # out [0.07557311 0.3943565  0.31875622]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_random_uniform_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.random_uniform_initializer(minval=0, maxval=0.5)
+
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_random_uniform_Job(x)
+        
+        # out.shape (1, 128, 32, 32)
+
+    """
+    assert minval <= maxval
+    initializer = initializer_conf_util.InitializerConf()
+    if dtype in [flow.float, flow.double]:
+        setattr(initializer.random_uniform_conf, "min", float(minval))
+        setattr(initializer.random_uniform_conf, "max", float(maxval))
+    elif dtype in [flow.int8, flow.int32, flow.int64]:
+        setattr(initializer.random_uniform_int_conf, "min", int(minval))
+        setattr(initializer.random_uniform_int_conf, "max", int(maxval))
+    else:
+        raise NotImplementedError("Do not support such data type")
+    return initializer
+
+
+def random_normal_initializer(
+    mean: float = 0.0,
+    stddev: float = 1.0,
+    seed: Optional[int] = None,
+    dtype: Optional[flow.dtype] = None,
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates blob with a normal distribution.
+
+    Args:
+        mean (float, optional): A python scalar. Mean of the random values to generate.. Defaults to 0.0.
+        stddev (float, optional): A python scalar. Standard deviation of the random values to generate. Defaults to 1.0.
+        seed (Optional[int], optional): None. Not support yet. Defaults to None.
+        dtype (Optional[flow.dtype], optional): . Defaults to None.
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def random_normal_Job() -> None:
+            init = flow.random_normal_initializer(mean=1, stddev=1)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        random_normal_Job()
+
+        # out [1.4190257 2.7663114 1.7114428]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_random_normal_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.random_normal_initializer(mean=0, stddev=1)
+
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_random_normal_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    assert seed is None
+    assert dtype is None
+    if seed is not None:
+        assert name is not None
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.random_normal_conf, "mean", float(mean))
+    setattr(initializer.random_normal_conf, "std", float(stddev))
+    return initializer
+
+
+def truncated_normal_initializer(
+    mean: float = 0.0, stddev: float = 1.0
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a truncated normal distribution.
+
+    Args:
+        mean (float, optional): A scalar (float). Defaults to 0.0.
+        stddev (float, optional): A scalar (float). Defaults to 1.0.
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def truncated_normal_Job() -> None:
+            init = flow.truncated_normal_initializer(mean=1, stddev=1)
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, ),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        truncated_normal_Job()
+
+        # out [1.8303236  0.09787154 0.83049864]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_truncated_normal_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal_initializer(mean=0, stddev=1)
+
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_truncated_normal_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.truncated_normal_conf, "mean", float(mean))
+    setattr(initializer.truncated_normal_conf, "std", float(stddev))
+    return initializer
+
+
+def glorot_uniform_initializer(
+    data_format: str = "",
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a Xavier uniform distribution. 
+    
+    It also can be called as `oneflow.glorot_uniform_initializer`.  
+
+    The equation is: 
+
+    .. math:: 
+
+        W\\sim U(-\\sqrt{\\frac{{6}}{{n_j+n_{j+1}}}},\\sqrt{\\frac{{6}}{{n_j+n_{j+1}}}})
+
+    :math:`U` means uniform distribution 
+
+    :math:`n_j` means the amount of Nth layer parameters 
+
+    Args:
+        data_format (str, optional): The data format. Defaults to "".
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1:
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def xavier_uniform_Job() -> None:
+            init = flow.xavier_uniform_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        xavier_uniform_Job()
+
+        # out [[-0.14424723 -0.9532095  -0.08723891]
+        #      [-0.8011227  -0.29729813 -0.26769108]
+        #      [ 0.9208976  -0.5971756  -0.15077025]]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_xavier_uniform_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.xavier_uniform_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_xavier_uniform_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return variance_scaling_initializer(1.0, "fan_avg", "random_uniform", data_format)
+
+
+def glorot_normal_initializer(
+    data_format: str = "",
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a Xavier normal distribution. 
+    
+    It also can be called as `oneflow.glorot_normal_initializer`.  
+
+    The equation is: 
+
+    .. math:: 
+
+        W\\sim N(0, \\sqrt{\\frac{{2}}{{n_j+n_{j+1}}}})
+
+    :math:`N` means normal distribution 
+
+    :math:`n_j` means the amount of Nth layer parameters 
+
+    Args:
+        data_format (str, optional): The data format. Defaults to "".
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def xavier_normal_Job() -> None:
+            init = flow.xavier_normal_initializer()
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        xavier_normal_Job()
+
+        # out [[ 0.5908121  -0.10804518 -0.6148571 ]
+        #      [ 1.4007381  -0.08172473  0.36579943]
+        #      [-0.6461796  -0.15923311  0.33653972]]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_xavier_normal_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.xavier_normal_initializer()
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_xavier_normal_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    return variance_scaling_initializer(1.0, "fan_avg", "random_normal", data_format)
+
+
+def variance_scaling_initializer(
+    scale: float = 1.0,
+    mode: str = "fan_in",
+    distribution: str = "truncated_normal",
+    data_format: str = "",
+) -> initializer_conf_util.InitializerConf:
+    """Initializer that generates a truncated normal distribution or a random normal distribution or a random uniform distribution with a scale adapting to it.
+
+    When the distribution is "truncated_normal"
+
+    The equation is: 
+
+    .. math:: 
+
+        W\\sim N(0, \\sqrt{\\frac{{scale}}{{n}}})
+
+    If mode is "fan_in", the "n" is the number of input units in the weight Blob. 
+
+    If mode is "fan_out", the "n" is the number of output units in the weight Blob. 
+
+    if mode is "fan_avg", the "n" is the average of the number of input and output units in the weight Blob
+
+    Args:
+        scale (float, optional): Scaling factor (positive float). Defaults to 1.0.
+        mode (str, optional): One of "fan_in", "fan_out", "fan_avg". Defaults to "fan_in".
+        distribution (str, optional): Random distribution to use. One of "truncated_normal",. Defaults to "truncated_normal".
+        data_format (str, optional): A string be one of "N...C" or "NC...". Defaults to "".
+
+    Returns:
+        initializer_conf_util.InitializerConf: Initial configuration
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def variance_scale_Job() -> None:
+            init = flow.variance_scaling_initializer(scale=2.0, mode="fan_avg")
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        variance_scale_Job()
+
+        # out [[-0.13931477  0.12266728 -0.9434968 ]
+        #      [-0.49665168  0.10231158 -0.19194333]
+        #      [-0.7902896  -1.7034698  -0.38695997]]
+
+    Example 2: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_variance_scaling_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.variance_scaling_initializer(mode="fan_out")
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_variance_scaling_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    initializer = initializer_conf_util.InitializerConf()
+    setattr(initializer.variance_scaling_conf, "scale", float(scale))
+    setattr(
+        initializer.variance_scaling_conf, "variance_norm", _get_variance_norm(mode)
+    )
+    setattr(
+        initializer.variance_scaling_conf,
+        "distribution",
+        _get_random_distribution(distribution),
+    )
+    setattr(
+        initializer.variance_scaling_conf, "data_format", _get_data_format(data_format)
+    )
+    return initializer
+
+
+def kaiming_initializer(
+    shape: Sequence[int],
+    distribution: str = "random_normal",
+    mode: str = "fan_in",
+    nonlinearity: str = "leaky_relu",
+    negative_slope: float = 0.0,
+    data_format: str = "NCHW",
+) -> None:
+    """Initialize weight according to the method described in `Delving deep into
+    rectifiers: Surpassing human-level performance on ImageNet classification`
+    - He, K. et al. (2015), using a normal or uniform distribution.
+
+    When distribution is "random_normal"
+
+    The equation is: 
+
+    .. math:: 
+
+        W \\sim N(0, \\sqrt{\\frac{{2}}{{n}}})
+
+    When distribution is "random_uniform"
+
+    The equation is: 
+
+    .. math:: 
+
+        W \\sim U(-\\sqrt{\\frac{{6}}{{n}}}, \\sqrt{\\frac{{6}}{{n}}})
+    
+    If mode is "fan_in", the "n" is the number of input units in the weight Blob. 
+
+    If mode is "fan_out", the "n" is the number of output units in the weight Blob. 
+
+    if mode is "fan_avg", the "n" is the average of the number of input and output units in the weight Blob
+
+    Args:
+        shape (Sequence[int]): Blob shape.
+        distribution (str, optional): 'random_normal' or 'random_uniform'. Defaults to "random_normal".
+        mode (str, optional): 'fan_in', 'fan_out' or 'fan_avg'. Defaults to "fan_in".
+        nonlinearity (str, optional): None, 'tanh', 'sigmoid', 'relu' or 'leaky_relu'. Defaults to "leaky_relu".
+        negative_slope (float, optional): The negative slope of leaky_relu. Defaults to 0.0.
+        data_format (str, optional):  'NCHW', 'NHWC'. Defaults to "NCHW".
+
+    Raises:
+        NotImplementedError: Only support normal and uniform distribution
+
+    Returns:
+        [type]: flow.random_normal_initializer or flow.random_uniform_initializer
+
+    For example: 
+
+    Example 1: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+
+        def watch_handler(y: tp.Numpy):
+            print("out", y)
+
+
+        @flow.global_function()
+        def kaiming_Job() -> None:
+            init = flow.kaiming_initializer(shape=(3, 3), 
+                                            mode="fan_avg", 
+                                            nonlinearity="relu")
+            blob = flow.get_variable(
+                "blob-weight",
+                shape=(3, 3),
+                initializer=init,
+                trainable=True
+            )
+            flow.watch(blob, watch_handler)
+
+
+        checkpoint = flow.train.CheckPoint()
+        checkpoint.init()
+        kaiming_Job()
+
+        # out [[ 0.54521346  0.32585594  1.3474437 ]
+        #      [ 0.30729076 -0.19158769  0.2709008 ]
+        #      [-0.95830524 -0.05093324  0.28178614]]
+
+    Example 2: 
+
+    .. code-block:: python 
+    
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_kaiming_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.kaiming_initializer(shape=(1, 256, 32, 32))
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_kaiming_Job(x)
+
+        # out.shape (1, 128, 32, 32)
+
+    """
+    assert isinstance(shape, (tuple, flow.Size))
+    assert len(shape) >= 2
+    elem_cnt = functools.reduce(lambda a, b: a * b, shape, 1)
+    assert elem_cnt > 0
+    assert distribution in ["random_normal", "random_uniform"]
+    assert mode in ["fan_in", "fan_out", "fan_avg"]
+    assert nonlinearity in [None, "tanh", "sigmoid", "relu", "leaky_relu"]
+    assert data_format in ["NCHW", "NHWC"]
+    fan = _CalcFan(shape, mode, _get_data_format(data_format))
+    gain = CalcGain(nonlinearity, negative_slope)
+    std = gain / math.sqrt(fan)
+    if distribution == "random_normal":
+        return flow.random_normal_initializer(0.0, std)
+    elif distribution == "random_uniform":
+        bound = math.sqrt(3.0) * std
+        return flow.random_uniform_initializer(-bound, bound)
+    else:
+        raise NotImplementedError("Only support normal and uniform distribution")
+
+
+def _get_variance_norm(mode):
+    if mode.lower() == "fan_in":
+        return initializer_conf_util.kFanIn
+    elif mode.lower() == "fan_out":
+        return initializer_conf_util.kFanOut
+    elif mode.lower() == "fan_avg":
+        return initializer_conf_util.kAverage
+    else:
+        raise ValueError("Invalid variance_norm")
+
+
+def _get_random_distribution(distribution):
+    if distribution.lower() == "truncated_normal":
+        return initializer_conf_util.kTruncatedNormal
+    elif distribution.lower() == "random_normal":
+        return initializer_conf_util.kRandomNormal
+    elif distribution.lower() == "random_uniform":
+        return initializer_conf_util.kRandomUniform
+    else:
+        raise ValueError("Invalid random_distribution")
+
+
+def _get_data_format(data_format):
+    assert isinstance(data_format, str), "data_format must be a string"
+    if data_format.startswith("NC"):
+        return "channels_first"
+    elif data_format.startswith("N") and data_format.endswith("C"):
+        return "channels_last"
+    else:
+        assert data_format == "", ValueError(
+            'data_format must be "N...C" or "NC..." or ""'
+        )
+        return ""
+
+
+def _CalcFan(shape, mode, data_format):
+    if len(shape) == 2:
+        fan_in = shape[1]
+        fan_out = shape[0]
+    else:
+        fan_in = 1.0
+        for dim in shape[1:]:
+            fan_in *= dim
+        fan_out = shape[0]
+        if data_format == "channels_first":
+            for dim in shape[2:]:
+                fan_out *= dim
+        elif data_format == "channels_last":
+            for dim in shape[1:-1]:
+                fan_out *= dim
+        else:
+            raise NotImplementedError(
+                "Only support 'channels_first' and 'channels_last' data format"
+            )
+    if mode == "fan_avg":
+        return (float(fan_in) + float(fan_out)) / 2
+    elif mode == "fan_in":
+        return float(fan_in)
+    elif mode == "fan_out":
+        return float(fan_out)
+    else:
+        raise NotImplementedError("Only support 'fan_in', 'fan_out' and 'fan_avg' mode")
+
+
+def CalcGain(nonlinearity, param):
+    linear_fns = [
+        "linear",
+        "conv1d",
+        "conv2d",
+        "conv3d",
+        "conv_transpose1d",
+        "conv_transpose2d",
+        "conv_transpose3d",
+    ]
+    if nonlinearity in linear_fns or nonlinearity == "sigmoid":
+        return 1
+    elif nonlinearity == "tanh":
+        return 5.0 / 3
+    elif nonlinearity == "relu":
+        return math.sqrt(2.0)
+    elif nonlinearity == "leaky_relu":
+        if param is None:
+            negative_slope = 0.01
+        elif (
+            not isinstance(param, bool)
+            and isinstance(param, int)
+            or isinstance(param, float)
+        ):
+            negative_slope = param
+        else:
+            raise ValueError("negative_slope {} not a valid number".format(param))
+        return math.sqrt(2.0 / (1 + negative_slope ** 2))
+    elif nonlinearity == "selu":
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+_init_map = {}
+
+
+def register_initializer(flow_initializer):
+    def deco(func):
+        _init_map[flow_initializer] = func
+        return func
+
+    return deco
+
+
+def GetInitializer(initializer_conf, random_seed, var_blob_shape):
+    f = None
+    for m in _init_map:
+        if initializer_conf.HasField(m):
+            f = _init_map[m]
+            break
+    assert f is not None, initializer_conf
+    return f(getattr(initializer_conf, m), random_seed, var_blob_shape)
+
+
+@register_initializer("constant_conf")
+@register_initializer("constant_int_conf")
+def ConstantInitializerImpl(
+    initializer_conf: Union[
+        initializer_conf_util.ConstantInitializerConf,
+        initializer_conf_util.ConstantIntInitializerConf,
+    ],
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    return lambda length: np.full((length,), initializer_conf.value)
+
+
+@register_initializer("random_normal_conf")
+def RandomNormalInitializerImpl(
+    initializer_conf: initializer_conf_util.RandomNormalInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: rng.normal(
+        loc=initializer_conf.mean, scale=initializer_conf.std, size=length
+    )
+
+
+@register_initializer("random_uniform_conf")
+def RandomUniformInitializerImpl(
+    initializer_conf: initializer_conf_util.RandomUniformIntInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: rng.uniform(
+        low=initializer_conf.min,
+        high=np.nextafter(initializer_conf.max, float("inf")),
+        size=length,
+    )
+
+
+@register_initializer("random_uniform_int_conf")
+def RandomUniformIntInitializerImpl(
+    initializer_conf: initializer_conf_util.RandomUniformIntInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: rng.integers(
+        low=initializer_conf.min, high=initializer_conf.max, size=length
+    )
+
+
+def RngTruncatedNormal(mean, std, length, rng):
+    truncated_value = 2 * std
+    data = np.empty(length)
+    generated = 0
+    ratio = 1.2
+    while generated < length:
+        remaining = length - generated
+        norm = rng.normal(mean, std, size=int(remaining * ratio))
+        truncated = norm[np.abs(norm - mean) < truncated_value][:remaining]
+        data[generated : generated + len(truncated)] = truncated
+        generated += len(truncated)
+    return data
+
+
+@register_initializer("truncated_normal_conf")
+def TruncatedNormalInitializerImpl(
+    initializer_conf: initializer_conf_util.TruncatedNormalInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    rng = np.random.default_rng(random_seed)
+    return lambda length: RngTruncatedNormal(
+        initializer_conf.mean, initializer_conf.std, length, rng
+    )
+
+
+def GenInitialFan(initializer_conf, var_blob_shape: Sequence[int]):
+    variance_norm = initializer_conf.variance_norm
+    data_format = initializer_conf.data_format
+    fan_in = np.prod(var_blob_shape[1:]).astype(np.int).item()
+    fan_out = var_blob_shape[0]
+    if data_format == "channel_first":
+        fan_out *= np.prod(var_blob_shape[2:]).astype(np.int).item()
+    else:
+        fan_out *= np.prod(var_blob_shape[1:-1]).astype(np.int).item()
+    if variance_norm == initializer_conf_util.kAverage:
+        fan = (fan_in + fan_out) / 2
+    elif variance_norm == initializer_conf_util.kFanIn:
+        fan = fan_in
+    elif variance_norm == initializer_conf_util.kFanOut:
+        fan = fan_out
+    else:
+        raise NotImplemented()
+    return fan
+
+
+@register_initializer("variance_scaling_conf")
+def VarianceScalingInitializerImpl(
+    initializer_conf: initializer_conf_util.VarianceScalingInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    scale = initializer_conf.scale / GenInitialFan(initializer_conf, var_blob_shape)
+    distribution = initializer_conf.distribution
+    rng = np.random.default_rng(random_seed)
+    if distribution == initializer_conf_util.kTruncatedNormal:
+        stddev = math.sqrt(scale) / 0.8796256610342398
+        return lambda length: RngTruncatedNormal(0, stddev, length, rng)
+    elif distribution == initializer_conf_util.kRandomNormal:
+        stddev = math.sqrt(scale)
+        return lambda length: rng.normal(0, stddev, size=length)
+    elif distribution == initializer_conf_util.kRandomUniform:
+        limit = math.sqrt(3.0 * scale)
+        return lambda length: rng.uniform(low=-limit, high=limit, size=length)
+    else:
+        raise NotImplemented()
+
+
+@register_initializer("empty_conf")
+def EmptyInitializerImpl(
+    initializer_conf: initializer_conf_util.EmptyInitializerConf,
+    random_seed: int,
+    var_blob_shape: Sequence[int],
+):
+    return None
diff --git a/python/oneflow/ops/linalg.py b/python/oneflow/ops/linalg.py
new file mode 100644
index 0000000000000000000000000000000000000000..e23a68722aebdc6b65314f35507869295ba19e32
--- /dev/null
+++ b/python/oneflow/ops/linalg.py
@@ -0,0 +1,25 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.remote_blob as remote_blob_util
diff --git a/python/oneflow/ops/loss_ops.py b/python/oneflow/ops/loss_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c413782725c5cfbde8e06baca4c7a9289bdcff45
--- /dev/null
+++ b/python/oneflow/ops/loss_ops.py
@@ -0,0 +1,275 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def smooth_l1_loss(
+    prediction: oneflow._oneflow_internal.BlobDesc,
+    label: oneflow._oneflow_internal.BlobDesc,
+    beta: float = 1.0,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the smooth l1 loss.
+
+    The equation is:
+
+    .. math::
+
+        & out = \\frac{(\\beta*x)^2}{2}, \\left|x\\right|<\\frac{1}{{\\beta}^2}
+
+        & out = \\left|x\\right|-\\frac{0.5}{{\\beta}^2}, otherwise
+
+
+    Args:
+        prediction (oneflow._oneflow_internal.BlobDesc): The prediction Blob
+        label (oneflow._oneflow_internal.BlobDesc): The label Blob
+        beta (float, optional): The :math:`\\beta` in the equation. Defaults to 1.0.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def smooth_l1_loss_Job(prediction: tp.Numpy.Placeholder((5, )),
+                            label: tp.Numpy.Placeholder((5, ))
+        ) -> tp.Numpy:
+            return flow.smooth_l1_loss(prediction=prediction,
+                                    label=label)
+
+
+        prediction = np.array([0.1, 0.4, 0.3, 0.5, 0.9]).astype(np.float32)
+        label = np.array([0.3, 0.9, 2.5, 0.4, 0.3]).astype(np.float32)
+        out = smooth_l1_loss_Job(prediction, label)
+
+        # out [0.02       0.12499999 1.7        0.005      0.17999998]
+
+    """
+    op = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("SmoothL1Loss_")
+        )
+        .Op("smooth_l1_loss")
+        .Input("prediction", [prediction])
+        .Input("label", [label])
+        .Output("loss")
+    )
+    op.Attr("beta", float(beta))
+    return op.Build().InferAndTryRun().RemoteBlobList()[0]
+
+
+def ctc_loss(
+    log_probs: oneflow._oneflow_internal.BlobDesc,
+    targets: oneflow._oneflow_internal.BlobDesc,
+    input_lengths: oneflow._oneflow_internal.BlobDesc,
+    target_lengths: oneflow._oneflow_internal.BlobDesc,
+    blank: int = 0,
+    reduction: str = "mean",
+    zero_infinity: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Computes the CTC(Connectionist Temporal Classification) loss.
+    This operator implements the CTC loss as presented in (Graves et al., 2006).
+
+
+    Args:
+        log_probs (oneflow._oneflow_internal.BlobDesc): A Blob of shape [input_length, batch_size, num_labels]. The logarithmized probabilities of the outputs (e.g. obtained with flow.nn.logsoftmax()).
+        targets (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size, max_target_length]. It represent the target sequences. Each element in the target sequence is a class index. And the target index cannot be blank (default=0).
+        input_lengths (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size]. It represent the lengths of the inputs. And the lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
+        target_lengths (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size]. It represent lengths of the targets. Lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
+        blank (int, optional): Blank label. Defaults to 0.
+        reduction (str, optional): The reduce type, it can be the one of "none", "mean", "sum". "none": no reduction will be applied, "mean": the output losses will be divided by the target lengths and then the mean over the batch is taken, "sum": the output will be summed. Defaults to "mean".
+        zero_infinity (bool, optional):  Whether to zero infinite losses and the associated gradients. Infinite losses mainly occur when the inputs are too short to be aligned to the targets. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def ctc_loss_job(
+            log_probs: tp.Numpy.Placeholder(shape=(5, 2, 3)),
+            targets: tp.Numpy.Placeholder(shape=(2, 3), dtype=flow.int32),
+            input_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
+            target_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            loss = flow.ctc_loss(
+                log_probs, targets, input_lengths, target_lengths, blank=0, reduction="none"
+            )
+            return loss
+
+
+        log_probs = np.array(
+            [
+                [[-1.1031, -0.7998, -1.5200], [-0.9808, -1.1363, -1.1908]],
+                [[-1.2258, -1.0665, -1.0153], [-1.1135, -1.2331, -0.9671]],
+                [[-1.3348, -0.6611, -1.5118], [-0.9823, -1.2355, -1.0941]],
+                [[-1.3850, -1.3273, -0.7247], [-0.8235, -1.4783, -1.0994]],
+                [[-0.9049, -0.8867, -1.6962], [-1.4938, -1.3630, -0.6547]],
+            ]
+        ).astype(np.float32)
+        targets = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
+        input_lengths = np.array([5, 5]).astype("int32")
+        target_lengths = np.array([3, 3]).astype("int32")
+        loss = ctc_loss_job(log_probs, targets, input_lengths, target_lengths)
+
+        # loss [3.918017 2.907672]
+
+    """
+    name = name if name is not None else id_util.UniqueStr("CTCLoss_")
+    (loss, _) = (
+        flow.user_op_builder(name)
+        .Op("ctc_loss")
+        .Input("log_probs", [log_probs])
+        .Input("targets", [targets])
+        .Input("input_lengths", [input_lengths])
+        .Input("target_lengths", [target_lengths])
+        .Output("loss")
+        .Output("alpha")
+        .Attr("blank", int(blank))
+        .Attr("zero_infinity", zero_infinity)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    if zero_infinity:
+        cond = flow.math.equal(
+            loss,
+            flow.constant(
+                float("inf"),
+                dtype=loss.dtype,
+                shape=loss.shape,
+                name=name + "_constant",
+            ),
+            name=name + "_equal",
+        )
+        loss = flow.where(
+            cond,
+            flow.zeros(dtype=loss.dtype, shape=loss.shape, name=name + "_zeros"),
+            loss,
+            name=name + "_where",
+        )
+    if reduction == "mean":
+        return flow.math.reduce_mean(
+            flow.math.xdivy(
+                loss,
+                flow.cast(
+                    flow.math.clip_by_value(
+                        target_lengths, min_value=1, name=name + "_clip_by_value"
+                    ),
+                    dtype=log_probs.dtype,
+                    name=name + "_cast",
+                ),
+                name=name + "_xdivy",
+            ),
+            name=name + "_reduce_mean",
+        )
+    elif reduction == "sum":
+        return flow.math.reduce_sum(loss, name=name + "_reduce_sum")
+    else:
+        return loss
+
+
+def ctc_greedy_decoder(
+    log_probs: oneflow._oneflow_internal.BlobDesc,
+    input_lengths: oneflow._oneflow_internal.BlobDesc,
+    merge_repeated: bool = True,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """Performs greedy decoding on the logits given in input (best path).
+
+    Args:
+        log_probs (oneflow._oneflow_internal.BlobDesc): A Blob of shape [input_length, batch_size, num_labels]. The logarithmized probabilities of the outputs (e.g. obtained with flow.nn.logsoftmax()).
+        input_lengths (oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size]. It represent the lengths of the inputs. And the lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
+        merge_repeated (bool, optional): If merge_repeated is True, merge repeated classes in output. This means that if consecutive logits' maximum indices are the same, only the first of these is emitted. Defaults to True.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        decoded(oneflow._oneflow_internal.BlobDesc): A Blob of shape [batch_size, input_length], The decoded outputs.
+        neg_sum_logits(oneflow._oneflow_internal.BlobDesc): A float matrix (batch_size x 1) containing, for the sequence found, the negative of the sum of the greatest logit at each timeframe.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+        import numpy as np
+        from typing import Tuple
+
+
+        @flow.global_function()
+        def ctc_greedy_decoder_job(
+            log_probs: tp.Numpy.Placeholder(shape=(4, 2, 5)),
+            input_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int64),
+        ) -> Tuple[tp.Numpy, tp.Numpy]:
+            decoded, neg_sum_logits = flow.nn.ctc_greedy_decoder(
+                log_probs, input_lengths, merge_repeated=True
+            )
+            return decoded, neg_sum_logits
+
+
+        log_probs = np.array(
+            [
+                [[-1.54, -1.20, -1.95, -1.65, -1.81], [-1.84, -1.74, -1.58, -1.55, -1.12]],
+                [[-1.68, -1.48, -1.89, -1.30, -2.07], [-1.13, -1.45, -1.24, -1.61, -1.66]],
+                [[-1.56, -1.40, -2.83, -1.67, -1.48], [-1.20, -2.01, -2.05, -1.95, -1.24]],
+                [[-2.09, -1.76, -1.36, -1.67, -1.45], [-1.85, -1.48, -1.34, -2.16, -1.55]],
+            ]
+        ).astype(np.float32)
+        input_lengths = np.array([4, 4])
+        decoded, neg_sum_logits = ctc_greedy_decoder_job(log_probs, input_lengths)
+
+        # decoded [[1 3 1 2] [0 2 0 0]]
+        # neg_sum_logits [[5.26] [4.79]]
+
+
+    """
+    name = name if name is not None else id_util.UniqueStr("CTCGreedyDecode_")
+    (decoded, neg_sum_logits) = (
+        flow.user_op_builder(name)
+        .Op("ctc_greedy_decoder")
+        .Input("log_probs", [log_probs])
+        .Input("input_lengths", [input_lengths])
+        .Output("decoded")
+        .Output("neg_sum_logits")
+        .Attr("merge_repeated", merge_repeated)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (decoded, neg_sum_logits)
diff --git a/python/oneflow/ops/losses.py b/python/oneflow/ops/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/ops/math_binary_elementwise_ops.py b/python/oneflow/ops/math_binary_elementwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..62785812440c3f8325a4f58e72ca2d07dfc37f15
--- /dev/null
+++ b/python/oneflow/ops/math_binary_elementwise_ops.py
@@ -0,0 +1,288 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def build_math_binary_elementwise_op(math_op, x, y, name=None):
+    if name is None:
+        name = id_util.UniqueStr(math_op + "_")
+    return (
+        flow.user_op_builder(name)
+        .Op(math_op)
+        .Input("x", [x])
+        .Input("y", [y])
+        .Output("z")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+
+
+def atan2(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the values of :math:`arctan(\\frac{x}{y})`.
+
+    The equation is:
+
+    .. math::
+
+        out = arctan(\\frac{x}{y})
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def atan2Job(x: tp.Numpy.Placeholder((3,),), y: tp.Numpy.Placeholder((3, ))
+        )-> tp.Numpy:
+            return flow.math.atan2(x, y)
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        y = np.array([4, 4, 4]).astype(np.float32)
+        out = atan2Job(x, y)
+
+
+        # out [0.24497867 0.4636476  0.6435011 ]
+        # We take the first value as an example
+        # (arctan(1/4) * pi) / 180 = 0.24497867
+
+    """
+    return build_math_binary_elementwise_op("atan2", x, y, name)
+
+
+def pow(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: Union[oneflow._oneflow_internal.BlobDesc, float],
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the Pow result.
+
+    The equation is:
+
+    .. math::
+
+        out = x^y
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (Union[oneflow._oneflow_internal.BlobDesc, float]): A Blob or float value, the exponential factor of Pow
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def powJob(x: tp.Numpy.Placeholder((3,), ), y: tp.Numpy.Placeholder((3,))
+                ) -> tp.Numpy:
+            return flow.math.pow(x, y)
+
+
+        x = np.array([2, 3, 4]).astype(np.float32)
+        y = np.array([2, 3, 4]).astype(np.float32)
+        out = powJob(x, y)
+
+        # out [  4.  27. 256.]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def scalar_pow_job(x: tp.Numpy.Placeholder(shape=(3, )))->tp.Numpy:
+            with flow.scope.placement("cpu", "0:0"):
+                out = flow.math.pow(x, 2.0)
+            return out
+
+
+        x = np.array([1, 2, 3]).astype(np.float32)
+        out = scalar_pow_job(x)
+
+        # out [1. 4. 9.]
+    """
+    if name is None:
+        name = id_util.UniqueStr("Pow_")
+    if isinstance(y, (int, float)):
+        return (
+            flow.user_op_builder(name)
+            .Op("scalar_pow")
+            .Input("in", [x])
+            .Attr("exponent", float(y))
+            .Output("out")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+        )
+    else:
+        return build_math_binary_elementwise_op("pow", x, y, name)
+
+
+def floordiv(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the result of :math:`x/y`, rounding toward the most negative integer value
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def floor_div_Job(x: tp.Numpy.Placeholder((3,)),
+                        y: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.floordiv(x, y)
+
+
+        x = np.array([4, 3, 5]).astype(np.float32)
+        y = np.array([3, 2, 2]).astype(np.float32)
+        out = floor_div_Job(x, y)
+
+        # out [1. 1. 2.]
+    """
+    return build_math_binary_elementwise_op("floordiv", x, y, name)
+
+
+def xdivy(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the result of :math:`x/y`
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def xdivy_Job(x: tp.Numpy.Placeholder((3,)),
+                        y: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.xdivy(x, y)
+
+
+        x = np.array([4, 3, 5]).astype(np.float32)
+        y = np.array([3, 2, 2]).astype(np.float32)
+        out = xdivy_Job(x, y)
+
+        # out [1.3333334 1.5       2.5      ]
+
+    """
+    return build_math_binary_elementwise_op("xdivy", x, y, name)
+
+
+def xlogy(
+    x: oneflow._oneflow_internal.BlobDesc,
+    y: oneflow._oneflow_internal.BlobDesc,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the result of :math:`x*log(y)`
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): A Blob
+        y (oneflow._oneflow_internal.BlobDesc): A Blob
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def xlogy_Job(x: tp.Numpy.Placeholder((3,)),
+                    y: tp.Numpy.Placeholder((3,))
+        ) -> tp.Numpy:
+            return flow.math.xlogy(x, y)
+
+
+        x = np.array([2, 2, 2]).astype(np.float32)
+        y = np.array([4, 8, 16]).astype(np.float32)
+        out = xlogy_Job(x, y)
+
+        # out [2.7725887 4.158883  5.5451775]
+    """
+    return build_math_binary_elementwise_op("xlogy", x, y, name)
diff --git a/python/oneflow/ops/nn_ops.py b/python/oneflow/ops/nn_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..84a338f415cec81b2ff0ba74cc19013953107937
--- /dev/null
+++ b/python/oneflow/ops/nn_ops.py
@@ -0,0 +1,202 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+
+
+def calc_same_padding(input_size, filter_size, dilation_rate, stride):
+    effective_filter_size = (filter_size - 1) * dilation_rate + 1
+    output_size = (input_size + stride - 1) // stride
+    padding_needed = max(
+        0, int((output_size - 1) * stride + effective_filter_size - input_size)
+    )
+    return padding_needed
+
+
+def calc_pool_padding(padding, dhw_offset, ndims):
+    if isinstance(padding, str):
+        padding = "SAME_LOWER" if padding.upper() == "SAME" else padding
+        assert padding.upper() in ["VALID", "SAME_LOWER", "SAME_UPPER"]
+        padding_type = padding.lower()
+        ndim_pads_list = [[0, 0]] * ndims
+    elif isinstance(padding, (list, tuple)):
+        padding_type = "customized"
+        ndim_pads_list = get_ndim_pads_list(padding, dhw_offset, ndims)
+    else:
+        raise ValueError("padding must be str or a list.")
+    return (padding_type, ndim_pads_list)
+
+
+def _GetSequence(value, n, name):
+    """Formats value from input"""
+    if value is None:
+        value = [1]
+    elif not isinstance(value, collections.Sized):
+        value = [value]
+    current_n = len(value)
+    if current_n == 1:
+        return list(value * n)
+    elif current_n == n:
+        return list(value)
+    else:
+        raise ValueError(
+            "{} should be of length 1 or {} but was {}".format(name, n, current_n)
+        )
+
+
+def get_dhw_offset(channel_pos):
+    if channel_pos == "channels_first":
+        return 2
+    else:
+        return 1
+
+
+def check_conv_cudnn_padding_support(
+    input_size, pad, filter_size, dilation_rate, stride, is_dynamic
+):
+    assert len(pad) == 2
+    if pad[0] == pad[1]:
+        return True
+    elif is_dynamic or pad[0] < pad[1] or pad[0] - pad[1] > 1:
+        return False
+    else:
+        effective_filter_size = (filter_size - 1) * dilation_rate + 1
+        cudnn_output_size = (
+            input_size + 2 * pad[0] - effective_filter_size + stride
+        ) // stride
+        output_size = (
+            input_size + pad[0] + pad[1] - effective_filter_size + stride
+        ) // stride
+        return cudnn_output_size == output_size
+
+
+def check_ndim_conv_cudnn_padding_support(
+    inputs_shape,
+    ndim_pads_list,
+    kernel_sizes,
+    dilations,
+    strides,
+    dhw_offset,
+    is_dynamic,
+):
+    ndims = len(ndim_pads_list)
+    for i in range(ndims):
+        cudnn_support = check_conv_cudnn_padding_support(
+            inputs_shape[dhw_offset + i],
+            ndim_pads_list[i],
+            kernel_sizes[i],
+            dilations[i],
+            strides[i],
+            is_dynamic,
+        )
+        if not cudnn_support:
+            return False
+    return True
+
+
+def get_ndim_pads_list(padding, dhw_offset, ndims):
+    pads_list = []
+    for i in range(len(padding)):
+        pad = padding[i]
+        if isinstance(pad, int):
+            pad = [pad, pad]
+        elif isinstance(pad, (list, tuple)):
+            assert len(pad) == 2
+            pad = [pad[0], pad[1]]
+        else:
+            raise ValueError("padding must be list tuple or int")
+        if i in range(dhw_offset, dhw_offset + ndims):
+            pads_list.append(pad)
+        else:
+            assert pad == [0, 0]
+    return pads_list
+
+
+def calc_ndim_same_padding(
+    input_shape, padding, kernel_sizes, dilations, strides, dhw_offset
+):
+    ndim_padding_needed = []
+    ndims = len(kernel_sizes)
+    for i in range(ndims):
+        ndim_padding_needed.append(
+            calc_same_padding(
+                input_shape[dhw_offset + i], kernel_sizes[i], dilations[i], strides[i]
+            )
+        )
+    pads_small = [padding_needed // 2 for padding_needed in ndim_padding_needed]
+    pads_large = [ndim_padding_needed[i] - pads_small[i] for i in range(ndims)]
+    if padding.upper() == "SAME_LOWER":
+        return [[pads_large[i], pads_small[i]] for i in range(ndims)]
+    elif padding.upper() == "SAME_UPPER":
+        return [[pads_small[i], pads_large[i]] for i in range(ndims)]
+    else:
+        raise NotImplementedError
+
+
+def calc_conv_padding(inputs, padding, data_format, kernel_sizes, dilations, strides):
+    ndims = len(inputs.shape) - 2
+    assert len(kernel_sizes) == ndims
+    assert len(dilations) == ndims
+    assert len(strides) == ndims
+    is_dynamic = inputs.is_dynamic
+    channel_pos = "channels_first" if data_format.startswith("NC") else "channels_last"
+    dhw_offset = get_dhw_offset(channel_pos)
+    ndim_pads_list = []
+    if isinstance(padding, str):
+        padding = "SAME_LOWER" if padding.upper() == "SAME" else padding
+        assert padding.upper() in ["VALID", "SAME_LOWER", "SAME_UPPER"]
+        if padding.upper() == "VALID":
+            return_pads_list = [[0, 0]] * ndims
+            return (inputs, return_pads_list)
+        elif is_dynamic:
+            return_pads_list = [[0, 0]] * ndims
+            inputs = flow.same_padding(
+                inputs,
+                padding.lower(),
+                data_format=data_format,
+                kernel_size=kernel_sizes,
+                strides=strides,
+                dilation_rate=dilations,
+            )
+            return (inputs, return_pads_list)
+        else:
+            ndim_pads_list = calc_ndim_same_padding(
+                inputs.shape, padding, kernel_sizes, dilations, strides, dhw_offset
+            )
+            assert len(ndim_pads_list) == ndims
+    elif isinstance(padding, (list, tuple)):
+        assert len(padding) == ndims + 2
+        ndim_pads_list = get_ndim_pads_list(padding, dhw_offset, ndims)
+        assert len(ndim_pads_list) == ndims
+    else:
+        raise ValueError("padding must be str or a list.")
+    cudnn_padding_support = check_ndim_conv_cudnn_padding_support(
+        inputs.shape,
+        ndim_pads_list,
+        kernel_sizes,
+        dilations,
+        strides,
+        dhw_offset,
+        is_dynamic,
+    )
+    if cudnn_padding_support:
+        return (inputs, ndim_pads_list)
+    else:
+        pad_op_list = [[0, 0]] * (ndims + 2)
+        for i in range(ndims):
+            pad_op_list[dhw_offset + i] = ndim_pads_list[i]
+        inputs = flow.pad(inputs, paddings=pad_op_list)
+        return_pads_list = [[0, 0]] * ndims
+        return (inputs, return_pads_list)
diff --git a/python/oneflow/ops/one_hot.py b/python/oneflow/ops/one_hot.py
new file mode 100644
index 0000000000000000000000000000000000000000..0484633e9cc59294b45ffd4d0c52ee5a840aa73c
--- /dev/null
+++ b/python/oneflow/ops/one_hot.py
@@ -0,0 +1,142 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def one_hot(
+    indices: oneflow._oneflow_internal.BlobDesc,
+    depth: int,
+    on_value: Union[int, float] = 1,
+    off_value: Union[int, float] = 0,
+    axis: int = -1,
+    dtype: Optional[flow.dtype] = None,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator generates a onehot Blob from input Blob.
+
+    If input Blob's rank is `N`, the corresponding onehot Blob's rank is `N+1`. The new axis is generated on the specified dimension according to the parameter `axis`.
+
+    The locations represented by `indices` take value `on_value`, while other locations take `off_value`
+
+    Args:
+        indices (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        depth (int): The length of onehot Blob.
+        on_value (Union[int, float], optional): The fill value when `indices[i] == i`. Defaults to 1.
+        off_value (Union[int, float], optional): The fill value when `indice[i] != i`. Defaults to 0.
+        axis (int, optional): The specified dimension that the new axis is generated on. Defaults to -1.
+        dtype (Optional[flow.dtype], optional): The output data type, it can be "oneflow.int32", "oneflow.int64", "oneflow.float", "oneflow.double". Defaults to None.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Note:
+
+        The data type of input blob should be `int32` or `int64`
+
+    For example:
+
+    Example 1:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def onehot_Job(x: tp.Numpy.Placeholder((4, ), dtype=flow.int32)
+        ) -> tp.Numpy:
+            return flow.one_hot(indices=x,
+                                depth=5,
+                                axis=-1,
+                                dtype=flow.int32)
+
+
+        x = np.array([0, 3, 1, 2]).astype(np.int32)
+        out = onehot_Job(x)
+
+        # out [[1 0 0 0 0]
+        #      [0 0 0 1 0]
+        #      [0 1 0 0 0]
+        #      [0 0 1 0 0]]
+
+    Example 2:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+        import numpy as np
+
+
+        @flow.global_function()
+        def onehot_Job(x: tp.Numpy.Placeholder((4, ), dtype=flow.int32)
+        ) -> tp.Numpy:
+            return flow.one_hot(indices=x,
+                                depth=5,
+                                axis=0,
+                                dtype=flow.int32)
+
+
+        x = np.array([0, 3, 1, 2]).astype(np.int32)
+        out = onehot_Job(x)
+
+        # out [[1 0 0 0]
+        #      [0 0 1 0]
+        #      [0 0 0 1]
+        #      [0 1 0 0]
+        #      [0 0 0 0]]
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: [description]
+    """
+    out_ndims = len(indices.shape) + 1
+    if axis < 0:
+        axis += out_ndims
+    assert axis >= 0 and axis < out_ndims, ValueError(
+        "Expected axis to between [%d, %d).  But received: %d "
+        % (-out_ndims, out_ndims, axis)
+    )
+    out = (
+        flow.user_op_builder(name if name is not None else id_util.UniqueStr("OneHot_"))
+        .Op("one_hot")
+        .Input("indices", [indices])
+        .Attr("depth", int(depth))
+        .Attr("floating_on_value", float(on_value))
+        .Attr("integer_on_value", int(on_value))
+        .Attr("floating_off_value", float(off_value))
+        .Attr("integer_off_value", int(off_value))
+        .Attr("dtype", dtype)
+        .Output("out")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()[0]
+    )
+    if axis != out_ndims - 1:
+        dim_list = list(range(0, out_ndims))
+        dim_list.insert(axis, out_ndims - 1)
+        dim_list.pop()
+        return flow.transpose(out, dim_list)
+    else:
+        return out
diff --git a/python/oneflow/ops/optimizer.py b/python/oneflow/ops/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5aa96abb495984bff7b776ec83256d9f059d1a1
--- /dev/null
+++ b/python/oneflow/ops/optimizer.py
@@ -0,0 +1,1989 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections.abc
+import traceback
+from typing import Callable, List, Optional, Sequence, Text, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.job.job_conf as job_conf_cfg
+import oneflow._oneflow_internal.oneflow.core.job.learning_rate_schedule_conf as learning_rate_schedule_conf_cfg
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.runtime_mode as rt_mode
+import oneflow.framework.session_context as session_ctx
+from oneflow import oneflow_deprecate
+
+
+def GetVariablesForCurrentJob() -> List[Text]:
+    sess = session_ctx.GetDefaultSession()
+    assert (
+        rt_mode.CurrentMode() == rt_mode.GLOBAL_MODE
+    ), "Optimizer's Variables() or minimize() method should be called inside a Job Function to implicitly get variables from a job."
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    return list(sess.job_name2var_name2var_blob_[job_name].keys())
+
+
+class ClipGradientConf:
+    @property
+    def clip_conf(self) -> job_conf_cfg.ClipConf:
+        raise NotImplementedError()
+
+
+class by_global_norm(ClipGradientConf):
+    """This operator limits the norm of `Input` with `clip_norm`.
+
+    If the norm of `Input` is less than the `clip_norm`,
+
+    the `Output` will be the same as `Input`.
+
+    If the norm of `Input` is greater than the `clip_norm`, the `Output` will be scaled.
+
+    The equation is:
+
+    .. math::
+
+        Output = \\frac{clip\\_norm*Input}{norm(Input)}
+
+    Args:
+        clip_norm (float): The maximum norm value.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set gradient_clip
+            gradient_clip = flow.optimizer.grad_clipping.by_global_norm(1.0)
+            # Set AdamW optimizer with gradient clip
+            flow.optimizer.AdamW(lr_scheduler,
+                        do_bias_correction=False, weight_decay=0.00005,
+                        grad_clipping=gradient_clip).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(self, clip_norm):
+        self.clip_norm = clip_norm
+
+    @property
+    def clip_conf(self):
+        clip_conf = job_conf_cfg.ClipConf()
+        clip_conf.mutable_clip_by_global_norm().set_clip_norm(self.clip_norm)
+        return clip_conf
+
+
+class WarmupConf:
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        raise NotImplementedError()
+
+
+class constant(WarmupConf):
+    """This operator use the constant warmup strategy to adjust the learning rate.
+
+    Before the steps are specified by user, the learning rate is:
+
+    .. math::
+
+        learning\\_rate = base\\_learning\\_rate*multiplier
+
+    After the steps are specified by user, the learning rate is:
+
+    .. math::
+
+        learning\\_rate = base\\_learning\\_rate
+
+    Args:
+        steps (int): [description]
+        multiplier (float): The scale factor :math:`multiplier`, it should be greater than 0. and less than 1.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Before 10 epochs, the learning rate is 0.001
+            # After 10 epochs, the learning rate is 0.01
+            warmup_scheduler = flow.optimizer.warmup.constant(10, 0.1)
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.01], warmup=warmup_scheduler)
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(self, steps, multiplier):
+        self.steps = steps
+        self.multiplier = multiplier
+
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        warmup_conf = learning_rate_schedule_conf_cfg.WarmupConf()
+        warmup_conf.mutable_constant_conf().set_warmup_batches(self.steps)
+        warmup_conf.mutable_constant_conf().set_multiplier(self.multiplier)
+        return warmup_conf
+
+
+class linear(WarmupConf):
+    """This operator uses the linear warmup strategy to adjust the learning rate.
+
+    When current train step is less than warmup steps, the learning rate will be updated as:
+
+    .. math::
+
+        & current\\_multiplier = start\\_multiplier + (1-start\\_multiplier)*\\frac{train\\_step}{warmup\\_step}
+
+        & current\\_learning\\_rate = learning\\_rate*current\\_multiplier
+
+    Args:
+        steps (int): The warmup steps.
+        start_multiplier (float): The start multiplier(:math:`start\\_multiplier`). It should be greater than 0. and less than 1.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Before 10 epochs, the learning rate will increase from 0.001 to 0.01 in linear.
+            warmup_scheduler = flow.optimizer.warmup.linear(10, 0.1)
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.01], warmup=warmup_scheduler)
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(self, steps, start_multiplier):
+        self.steps = steps
+        self.start_multiplier = start_multiplier
+
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        warmup_conf = learning_rate_schedule_conf_cfg.WarmupConf()
+        warmup_conf.mutable_linear_conf().set_warmup_batches(self.steps)
+        warmup_conf.mutable_linear_conf().set_start_multiplier(self.start_multiplier)
+        return warmup_conf
+
+
+class LrScheduler:
+    def __init__(
+        self,
+        base_lr: Optional[float] = None,
+        lr_lbn: Optional[Text] = None,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        self.base_lr = base_lr
+        self.lr_lbn = lr_lbn
+        self.warmup = warmup
+
+    @property
+    def warmup_conf(self) -> learning_rate_schedule_conf_cfg.WarmupConf:
+        if self.warmup is None:
+            return None
+        return self.warmup.warmup_conf
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        raise NotImplementedError()
+
+    def SetLrFieldsInOptimizerConf(self, optimizer_conf) -> None:
+        if self.lr_lbn is not None:
+            assert self.base_lr is None
+            assert self.warmup is None
+            assert self.learning_rate_decay_conf is None
+            optimizer_conf.set_learning_rate_lbn(self.lr_lbn)
+        else:
+            assert self.base_lr is not None
+            optimizer_conf.set_base_learning_rate(self.base_lr)
+            if self.warmup_conf is not None:
+                optimizer_conf.mutable_warmup_conf().CopyFrom(self.warmup_conf)
+            if self.learning_rate_decay_conf is not None:
+                optimizer_conf.mutable_learning_rate_decay().CopyFrom(
+                    self.learning_rate_decay_conf
+                )
+
+
+class CosineScheduler(LrScheduler):
+    """This operator creates a Cosine decayed learning rate scheduler.
+
+    Before the steps are specified by user, the learning rate will be updated as:
+
+    .. math::
+
+        & cos\\_decay = 0.5*(1+cos(\\pi*\\frac{current\\_batch}{decayed\\_batch}))
+
+        & decay\\_factor = (1-\\alpha)*cos\\_decay+\\alpha
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_factor
+
+    After the steps specified by user, the learning rate will be :
+
+    .. math::
+
+        learning\\_rate = {base\\_learning\\_rate}*{\\alpha}
+
+    Args:
+        base_lr (float): The base learning rate (:math:`base\\_learning\\_rate`)
+        steps (int): The decay steps in the scheduler (:math:`decayed\\_batch`)
+        alpha (float, optional): The learning rate scale factor (:math:`\\alpha`). Defaults to 0.0.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.CosineScheduler(base_lr=0.01,
+                                                          steps=10,
+                                                          alpha=0.1)
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        alpha: float = 0.0,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.alpha = alpha
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_cosine_conf().set_decay_batches(self.steps)
+        learning_rate_decay_conf.mutable_cosine_conf().set_alpha(self.alpha)
+        return learning_rate_decay_conf
+
+
+class CustomScheduler(LrScheduler):
+    def __init__(self, lbn: Text):
+        super().__init__(lr_lbn=lbn)
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> learning_rate_schedule_conf_cfg.LearningRateDecayConf:
+        return None
+
+
+class PiecewiseConstantScheduler(LrScheduler):
+    """This operator creates a piecewise constant learning rate scheduler.
+
+    The change in learning rate can be described as follows:
+
+    .. code-block:: python
+
+        boundaries = [1000, 2000]
+        values = [0.1, 0.01, 0.001]
+
+        if current_step < 1000:
+            learning_rate = 0.1
+        elif 1000 < current_step < 2000:
+            learning_rate = 0.01
+        else:
+            learning_rate = 0.001
+
+    Args:
+        boundaries (Sequence[int]): A list of train steps.
+        values (Sequence[float]): A list of learning rate values during the different train step boundary.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+                images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler(boundaries=[10, 20],
+                                                                     values=[0.1, 0.01, 0.001])
+            flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        boundaries: Sequence[int],
+        values: Sequence[float],
+        warmup: Optional[WarmupConf] = None,
+    ):
+        assert len(boundaries) + 1 == len(values)
+        super().__init__(base_lr=values[0], warmup=warmup)
+        self.boundaries = boundaries
+        self.values = values
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        for boundary in self.boundaries:
+            learning_rate_decay_conf.mutable_piecewise_constant_conf().add_boundaries(
+                boundary
+            )
+        for value in self.values:
+            learning_rate_decay_conf.mutable_piecewise_constant_conf().add_values(value)
+        return learning_rate_decay_conf
+
+
+class PiecewiseScalingScheduler(LrScheduler):
+    """This operator creates a piecewise scaled decayed learning rate scheduler.
+
+    The change in learning rate can be described as follows:
+
+    .. code-block:: python
+
+        boundaries = [1000, 2000]
+        scale = [0.1, 0.01]
+        base_lr = 0.1
+
+        if current_step < 1000:
+            learning_rate = base_lr
+        elif 1000 < current_step < 2000:
+            learning_rate = 0.1*base_lr
+        else:
+            learning_rate = 0.01*base_lr
+
+    Args:
+        base_lr (float): The base learning rate
+        boundaries (Sequence[int]): A list of train steps.
+        scale (Union[float, Sequence[float]]): A list of learning rate scaled factors during the different train step boundary.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(base_lr=0.1,
+                                                                    boundaries=[5, 10],
+                                                                    scale=[0.5, 0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        boundaries: Sequence[int],
+        scale: Union[float, Sequence[float]],
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.boundaries = boundaries
+        if not isinstance(scale, collections.abc.Sequence):
+            scale = [scale] * len(boundaries)
+        assert len(boundaries) == len(scale)
+        self.scales = [1] + list(scale)
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        for boundary in self.boundaries:
+            learning_rate_decay_conf.mutable_piecewise_scaling_conf().add_boundaries(
+                boundary
+            )
+        for scale in self.scales:
+            learning_rate_decay_conf.mutable_piecewise_scaling_conf().add_scales(scale)
+        return learning_rate_decay_conf
+
+
+class PolynomialScheduler(LrScheduler):
+    """This operator creates a polynomial decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If cycle is `True`, the equation is:
+
+    .. math::
+
+        & decay\\_batch = decay\\_batch*ceil(\\frac{current\\_batch}{decay\\_batch})
+
+        & learning\\_rate = (base\\_lr-end\\_lr)*(1-\\frac{current\\_batch}{decay\\_batch})^{pow}+end\\_lr
+
+    If cycle is `False`, the equation is:
+
+    .. math::
+
+        & decay\\_batch = min(decay\\_batch, current\\_batch)
+
+        & learning\\_rate = (base\\_lr-end\\_lr)*(1-\\frac{current\\_batch}{decay\\_batch})^{pow}+end\\_lr
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decayed steps
+        end_learning_rate (float, optional): The final learning rate. Defaults to 0.0001.
+        power (float, optional): The power of polynomial. Defaults to 1.0.
+        cycle (bool, optional): If cycle is true, the scheduler will decay the learning rate every decay steps. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow as flow
+            import oneflow.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.PolynomialScheduler(base_lr=0.001,
+                                                                 steps=5,
+                                                                 end_learning_rate=0.00001,
+                                                                 power=2)
+                flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        end_learning_rate: float = 0.0001,
+        power: float = 1.0,
+        cycle: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.end_learning_rate = end_learning_rate
+        self.power = power
+        self.cycle = cycle
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_polynomial_conf().set_decay_batches(self.steps)
+        learning_rate_decay_conf.mutable_polynomial_conf().set_end_learning_rate(
+            self.end_learning_rate
+        )
+        learning_rate_decay_conf.mutable_polynomial_conf().set_power(self.power)
+        learning_rate_decay_conf.mutable_polynomial_conf().set_cycle(self.cycle)
+        return learning_rate_decay_conf
+
+
+from oneflow import oneflow_deprecate
+
+
+@oneflow_deprecate()
+class PolynomialSchduler(PolynomialScheduler):
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        end_learning_rate: float = 0.0001,
+        power: float = 1.0,
+        cycle: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        print(
+            "WARNING:",
+            "oneflow.optimizer.PolynomialSchduler",
+            "will be removed in the future, use {} instead.".format(
+                "oneflow.optimizer.PolynomialScheduler"
+            ),
+        )
+        print(traceback.format_stack()[-2])
+        super().__init__(
+            base_lr=base_lr,
+            steps=steps,
+            end_learning_rate=end_learning_rate,
+            power=power,
+            cycle=cycle,
+            warmup=warmup,
+        )
+
+
+class LinearCosineScheduler(LrScheduler):
+    """This operator creates a linear cosine decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    .. math::
+
+        & current\\_batch = min(current\\_batch, decay\\_batch)
+
+        & linear\\_decay = \\frac{(decay\\_batch - current\\_batch)}{decay\\_batch}
+
+        & cosine\\_decay = 0.5*(1.0+cos(2*\\pi*num\\_periods*\\frac{current\\_batch}{decay\\_batch}))
+
+        & decay\\_factor = (\\alpha+linear\\_decay)*cosine\\_decay + \\beta
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_factor
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        num_periods (float, optional): The number of decay periods. Defaults to 0.5.
+        alpha (float, optional): The :math:`\\alpha` in equation. Defaults to 0.0.
+        beta (float, optional): The :math:`\\beta` in equation. Defaults to 0.001.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow as flow
+            import oneflow.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.LinearCosineScheduler(base_lr=0.1,
+                                                                    steps=10)
+                flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        num_periods: float = 0.5,
+        alpha: float = 0.0,
+        beta: float = 0.001,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.num_periods = num_periods
+        self.alpha = alpha
+        self.beta = beta
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_decay_batches(
+            self.steps
+        )
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_num_periods(
+            self.num_periods
+        )
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_alpha(self.alpha)
+        learning_rate_decay_conf.mutable_linear_cosine_conf().set_beta(self.beta)
+        return learning_rate_decay_conf
+
+
+class ExponentialScheduler(LrScheduler):
+    """This operator creates a exponential decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If staircase is set to False, the equation is:
+
+    .. math::
+
+        & pow = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_rate^{pow}
+
+    If staircase is set to True, the equation is:
+
+    .. math::
+
+        & pow = floor(\\frac{current\\_batch}{decay\\_batch})
+
+        & learning\\_rate = base\\_learning\\_rate*decay\\_rate^{pow}
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        decay_rate (float): The decay rate
+        staircase (bool, optional): If staircase is True, the scheduler decay the learning rate at discrete intervals. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block::python
+
+            import oneflow as flow
+            import oneflow.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.CosineScheduler(base_lr=0.01,
+                                                              steps=10,
+                                                              alpha=0.1)
+                flow.optimizer.Adam(lr_scheduler).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        decay_rate: float,
+        staircase=False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.decay_rate = decay_rate
+        self.staircase = staircase
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_exponential_conf().set_decay_batches(
+            self.steps
+        )
+        learning_rate_decay_conf.mutable_exponential_conf().set_decay_rate(
+            self.decay_rate
+        )
+        learning_rate_decay_conf.mutable_exponential_conf().set_staircase(
+            self.staircase
+        )
+        return learning_rate_decay_conf
+
+
+class InverseTimeScheduler(LrScheduler):
+    """This operator creates a inverse time decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If staircase is set to False, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = \\frac{base\\_learning\\_rate}{1+decay\\_rate*step\\_ratio}
+
+    If staircase is set to True, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = \\frac{base\\_learning\\_rate}{1+floor(decay\\_rate*step\\_ratio)}
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        decay_rate (float): The decay rate
+        staircase (bool, optional): If staircase is True, the scheduler decay the learning rate at discrete intervals. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow as flow
+            import oneflow.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.InverseTimeScheduler(base_lr=0.1,
+                                                                   steps=5,
+                                                                   decay_rate=0.9)
+                flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        decay_rate: float,
+        staircase: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.decay_rate = decay_rate
+        self.staircase = staircase
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_inverse_time_conf().set_decay_batches(
+            self.steps
+        )
+        learning_rate_decay_conf.mutable_inverse_time_conf().set_decay_rate(
+            self.decay_rate
+        )
+        learning_rate_decay_conf.mutable_inverse_time_conf().set_staircase(
+            self.staircase
+        )
+        return learning_rate_decay_conf
+
+
+class NaturalExpScheduler(LrScheduler):
+    """This operator creates a natural exponential decayed learning rate scheduler.
+
+    The learning rate will be updated as follows:
+
+    If staircase is set to False, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = {base\\_learning\\_rate}*e^{-decay\\_rate*step\\_ratio}
+
+    If staircase is set to True, the equation is:
+
+    .. math::
+
+        & step\\_ratio = \\frac{current\\_batch}{decay\\_batch}
+
+        & learning\\_rate = {base\\_learning\\_rate}*e^{-decay\\_rate*floor(step\\_ratio)}
+
+    Args:
+        base_lr (float): The base learning rate
+        steps (int): The decay steps
+        decay_rate (float): The decay rate
+        staircase (bool, optional): If staircase is True, the scheduler decay the learning rate at discrete intervals. Defaults to False.
+        warmup (Optional[WarmupConf], optional): The warmup strategy. Defaults to None.
+
+    For example:
+
+        .. code-block:: python
+
+            import oneflow as flow
+            import oneflow.typing as tp
+
+            @flow.global_function(type="train")
+            def train_job(
+                    images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                    labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+            ) -> tp.Numpy:
+                with flow.scope.placement("gpu", "0:0"):
+                    logits = lenet(images, train=True)
+                    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                        labels, logits, name="softmax_loss"
+                    )
+
+                lr_scheduler = flow.optimizer.NaturalExpScheduler(base_lr=0.1,
+                                                                  steps=10,
+                                                                  decay_rate=0.5)
+                flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+                return loss
+
+    """
+
+    def __init__(
+        self,
+        base_lr: float,
+        steps: int,
+        decay_rate: float,
+        staircase: bool = False,
+        warmup: Optional[WarmupConf] = None,
+    ):
+        super().__init__(base_lr=base_lr, warmup=warmup)
+        self.steps = steps
+        self.decay_rate = decay_rate
+        self.staircase = staircase
+
+    @property
+    def learning_rate_decay_conf(
+        self,
+    ) -> Optional[learning_rate_schedule_conf_cfg.LearningRateDecayConf]:
+        learning_rate_decay_conf = (
+            learning_rate_schedule_conf_cfg.LearningRateDecayConf()
+        )
+        learning_rate_decay_conf.mutable_natural_exp_conf.set_decay_batches(self.steps)
+        learning_rate_decay_conf.mutable_natural_exp_conf.set_decay_rate(
+            self.decay_rate
+        )
+        learning_rate_decay_conf.mutable_natural_exp_conf.set_staircase(self.staircase)
+        return learning_rate_decay_conf
+
+
+class LossScalePolicy:
+    def SetLossScaleFieldsInTrainConf(self, train_conf):
+        raise NotImplementedError()
+
+
+class StaticLossScalePolicy(LossScalePolicy):
+    def __init__(self, loss_scale_factor: float):
+        super().__init__()
+        self.loss_scale_factor = loss_scale_factor
+
+    def SetLossScaleFieldsInTrainConf(self, train_conf):
+        train_conf.loss_scale_factor = self.loss_scale_factor
+
+
+class DynamicLossScalePolicy(LossScalePolicy):
+    def __init__(
+        self, initial_loss_scale=2 ** 30, increment_period=2000, multiplier=2.0
+    ):
+        super().__init__()
+        self.initial_loss_scale = initial_loss_scale
+        self.increment_period = increment_period
+        self.multiplier = multiplier
+
+    def SetLossScaleFieldsInTrainConf(self, train_conf):
+        train_conf.mutable_dynamic_loss_scale_policy().set_initial_loss_scale(
+            self.initial_loss_scale
+        )
+        train_conf.mutable_dynamic_loss_scale_policy().set_increment_period(
+            self.increment_period
+        )
+        train_conf.mutable_dynamic_loss_scale_policy().set_multiplier(self.multiplier)
+
+
+class Optimizer:
+    def __init__(
+        self,
+        loss_scale_factor: Optional[int] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+    ):
+        self.train_step_lbn = train_step_lbn
+        if loss_scale_factor is not None:
+            assert loss_scale_policy is None
+            self.loss_scale_policy = StaticLossScalePolicy(loss_scale_factor)
+        else:
+            self.loss_scale_policy = loss_scale_policy
+        self._variables_list_init = False
+
+    def Variables(self) -> List[Text]:
+        if not self._variables_list_init:
+            if self.variables is None:
+                self.variables = list(GetVariablesForCurrentJob())
+            elif callable(self.variables):
+                self.variables = list(self.variables())
+            else:
+                self.variables = list(self.variables)
+            self._variables_list_init = True
+        return self.variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf: job_conf_cfg.TrainConf) -> None:
+        raise NotImplementedError()
+
+    @property
+    def train_conf(self) -> job_conf_cfg.TrainConf:
+        train_conf = job_conf_cfg.TrainConf()
+        if self.train_step_lbn is not None:
+            train_conf.set_train_step_lbn(self.train_step_lbn)
+        if self.loss_scale_policy is not None:
+            self.loss_scale_policy.SetLossScaleFieldsInTrainConf(train_conf)
+        self._AddOptimizerConfInTrainConf(train_conf)
+        return train_conf
+
+    def minimize(
+        self,
+        loss: Union[
+            Sequence[oneflow._oneflow_internal.BlobDesc],
+            oneflow._oneflow_internal.BlobDesc,
+        ],
+    ) -> None:
+        if not isinstance(loss, collections.abc.Sequence):
+            loss = [loss]
+        c_api_util.CurJobBuildAndInferCtx_SetTrainConf(self.train_conf)
+        for x in loss:
+            flow.losses.add_loss(x)
+
+
+class SGD(Optimizer):
+    """The optimizer of the stochastic gradient descent algorithm.
+
+    This algorithm takes a random sample's gradient as an approximate estimate of the overall gradient in small batch gradient descent.
+
+    When the momentum = 0, the equation of parameters updating is:
+
+    .. math::
+
+        param_{new} = param_{old} - learning\\_rate*grad
+
+    With momentum, the equation of parameters updating is:
+
+    .. math::
+
+        & V_{t} = \\beta*V_{t-1} + learning\\_rate*g_t
+
+        & param_{new} = param_{old} - V_{t}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        momentum (float, optional): Momentum factor (:math:`\\beta`). Defaults to 0.9.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set Learning rate as 0.1
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            # Set Momentum=0.9 SGD optimizer
+            flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+
+            return loss
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        loss_scale_factor: Optional[float] = None,
+        momentum: float = 0.9,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.momentum = momentum
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        if self.momentum == 0:
+            optimizer_conf.mutable_naive_conf()
+        else:
+            optimizer_conf.mutable_momentum_conf().set_beta(self.momentum)
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class SGDW(Optimizer):
+    """The optimizer of the stochastic-gradient-descent-weight-decay algorithm.
+
+    (More details please refer to `Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`_).
+
+    When the momentum = 0, the equation of parameters updating is:
+
+    .. math::
+
+        param_{new} = param_{old} - learning\\_rate*(grad + \\lambda*param_{old}))
+
+    With momentum, the equation of parameters updating is:
+
+    .. math::
+
+        & V_{t} = \\beta*V_{t-1} - learning\\_rate*g_t
+
+        & param_{new} = param_{old} + V_{t} - learning\\_rate * \\lambda*param_{old}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        momentum (float, optional): Momentum factor (:math:`\\beta`). Defaults to 0.9.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set Learning rate as 0.1
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            # Set Momentum=0.9 SGDW optimizer, weight_decay factor is 0.00005
+            flow.optimizer.SGDW(lr_scheduler, momentum=0.9, weight_decay=0.00005).minimize(loss)
+
+            return loss
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        loss_scale_factor: Optional[float] = None,
+        momentum: float = 0.9,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.momentum = momentum
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        if self.momentum == 0:
+            optimizer_conf.mutable_naive_conf()
+        else:
+            optimizer_conf.mutable_momentum_conf().set_beta(self.momentum)
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class Adam(Optimizer):
+    """The optimizer of the Adam algorithm.
+
+    This algorithm can adjust the learning rate of each parameter dynamically according to the 1st-moment estimates
+
+    and the 2nd-moment estimates of gradient.
+
+    With bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{V_t} = \\frac{V_t}{1-\\beta_1^t}
+
+        & \\hat{S_t} = \\frac{S_t}{1-\\beta_2^t}
+
+        & \\hat{g} = learning\\_rate*\\frac{\\hat{V_t}}{\\sqrt{\\hat{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Without bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    More details please refer to `Adam <https://arxiv.org/abs/1412.6980>`_
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        do_bias_correction (bool, optional): Whether to do the bias correction. Defaults to False.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set Adam optimizer
+            flow.optimizer.Adam(lr_scheduler, do_bias_correction=False).minimize(loss)
+
+            return loss
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-08,
+        do_bias_correction=False,
+        loss_scale_factor: Optional[float] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.do_bias_correction = do_bias_correction
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_adam_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_adam_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_adam_conf().set_epsilon(self.epsilon)
+        optimizer_conf.mutable_adam_conf().set_do_bias_correction(
+            self.do_bias_correction
+        )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class AdamW(Optimizer):
+    """The optimizer of the Adam-weight-decay algorithm.
+
+    If we use L2 regularization,
+
+    it will be invalid due to the adaptive learning rate in Adam optimizer
+
+    (More details please refer to `Adam-weight-decay <https://www.fast.ai/2018/07/02/adam-weight-decay/>`_).
+
+    So we use Adam-weight-decay algorithm to solve this problem.
+
+    With bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{V_t} = \\frac{V_t}{1-\\beta_1^t}
+
+        & \\hat{S_t} = \\frac{S_t}{1-\\beta_2^t}
+
+        & \\hat{g} = learning\\_rate*(\\frac{\\hat{V_t}}{\\sqrt{\\hat{S_t}}+\\epsilon}+\\lambda*param_{old})
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Without bias correction, the equation of parameters updating is:
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*(\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}+\\lambda*param_{old})
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        do_bias_correction (bool, optional): Whether to do the bias correction. Defaults to False.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set AdamW optimizer, weight_decay factor is 0.00005
+            flow.optimizer.AdamW(lr_scheduler,
+                    do_bias_correction=False, weight_decay=0.00005).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-08,
+        do_bias_correction=False,
+        loss_scale_factor: Optional[float] = None,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.do_bias_correction = do_bias_correction
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_adam_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_adam_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_adam_conf().set_epsilon(self.epsilon)
+        optimizer_conf.mutable_adam_conf().set_do_bias_correction(
+            self.do_bias_correction
+        )
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class RMSProp(Optimizer):
+    """The optimizer of the RMSProp algorithm.
+
+    This algorithm uses mean squared gradient to adjust the learning rate.
+
+    The equation of parameters updating is:
+
+        if centered:
+
+            .. math::
+
+                & mg_t = mg * \\beta_1 + (1 - \\beta_1) * grad
+
+                & denom_t = S_t - mg_t * mg_t
+
+        else:
+
+            .. math::
+
+                denom_t = S_t
+
+        .. math::
+
+            param_{new} = param_{old} - \\frac{learning\\_rate}{\\sqrt{denom_t+\\epsilon}} \\odot grad
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        decay_rate (float, optional): The decay factor (:math:`\\beta_1`). Defaults to 0.99.
+        epsilon (float, optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        centered (bool, optional): If `True`, gradients are normalized by the estimated
+                                   variance of the gradient; if False, by the uncentered second moment.
+                                   Setting this to `True` may help with training, but is slightly more
+                                   expensive in terms of computation and memory. Defaults to `False`.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set RMSProp optimizer
+            flow.optimizer.RMSProp(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        decay_rate: float = 0.99,
+        epsilon: float = 1e-08,
+        centered: bool = False,
+        loss_scale_factor: Optional[float] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.decay_rate = decay_rate
+        self.epsilon = epsilon
+        self.centered = centered
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_rmsprop_conf().set_decay_rate(self.decay_rate)
+        optimizer_conf.mutable_rmsprop_conf().set_centered(self.centered)
+        optimizer_conf.mutable_rmsprop_conf().set_epsilon(self.epsilon)
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class LARS(Optimizer):
+    """The optimizer of the LARS algorithm.
+
+    The equation of parameters updating is:
+
+    .. math::
+
+        & local\\_learning\\_rate = learning\\_rate*lars\\_coeff*\\frac{\\lVert{parm_{old}\\rVert}}{\\epsilon+\\lVert{grad\\rVert}+weight_decay*\\lVert{parm_{old}\\rVert}}
+
+        & momentum_t = \\beta*momentum_{t-1} + local\\_learning\\_rate*(grad)
+
+        & param_{new} = param_{old} - momentum_t - local_learning_rate * weight_decay * param_{old}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        momentum_beta (float, optional): The momentum factor (:math:`\\beta`). Defaults to 0.9.
+        epsilon (float, optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-9.
+        lars_coefficient (float, optional): The coefficient factor, it defines how much we trust the layer to change its weights (:math:`lars\\_coeff`). Defaults to 0.0001.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+                images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.1
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            # Set LARS optimizer, momentum factor is 0.9
+            flow.optimizer.LARS(lr_scheduler, momentum_beta=0.9).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        momentum_beta: float = 0.9,
+        epsilon: float = 1e-09,
+        lars_coefficient: float = 0.0001,
+        loss_scale_factor: Optional[float] = None,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.momentum_beta = momentum_beta
+        self.epsilon = epsilon
+        self.lars_coefficient = lars_coefficient
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_lars_conf().set_momentum_beta(self.momentum_beta)
+        optimizer_conf.mutable_lars_conf().set_epsilon(self.epsilon)
+        optimizer_conf.mutable_lars_conf().set_lars_coefficient(self.lars_coefficient)
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class LazyAdam(Optimizer):
+    """
+    The optimizer of the LazyAdam algorithm.
+
+    This algorithm can adjust the learning rate of each parameter dynamically according to the 1st-moment estimates and the 2nd-moment estimates of the gradient.
+
+    The difference between Adam optimizer and LazyAdam optimizer is that LazyAdam only updates the element that has gradient in the current batch, it is faster than Adam optimizer.
+
+    .. math::
+
+        & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad
+
+        & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad}
+
+        & \\hat{g} = learning\\_rate*\\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon}
+
+        & param_{new} = param_{old} - \\hat{g}
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-8.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        @flow.global_function(type="train")
+        def train_job(
+            images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+            labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+            # Set learning rate as 0.001
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
+            # Set LazyAdam optimizer
+            flow.optimizer.LazyAdam(lr_scheduler).minimize(loss)
+
+            return loss
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        epsilon: float = 1e-08,
+        loss_scale_factor: Optional[float] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_lazy_adam_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_lazy_adam_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_lazy_adam_conf().set_epsilon(self.epsilon)
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class LAMB(Optimizer):
+    """
+
+    Args:
+        lr_scheduler (LrScheduler): The scheduler of learning rate.
+        beta1 (float, optional): The exponential weighted average decay rate for the 1st-moment estimates (:math:`\\beta_1`). Defaults to 0.9.
+        beta2 (float, optional): The exponential weighted average decay rate for the 2rd-moment estimates (:math:`\\beta_2`). Defaults to 0.999.
+        epsilon ([type], optional): A small float constant value for numerical stability (:math:`\\epsilon`). Defaults to 1e-6.
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        weight_decay (Optional[float], optional): The weight decay factor (In the equation is :math:`\\lambda`). Defaults to None.
+        weight_decay_includes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that use weight decay. Defaults to None.
+        weight_decay_excludes (Optional[Union[Sequence[Text], Text]], optional): The name of the model parameters that do not use weight decay. Defaults to None.
+        grad_clipping (Optional[ClipGradientConf], optional): The gradient clipping strategy. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+        variables(Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ]): maintained variables.
+
+    Note:
+
+        Only one of `weight_decay_includes` and `weight_decay_excludes` can be set. If both are None,
+        all the model parameters will use weight decay.
+
+    """
+
+    def __init__(
+        self,
+        lr_scheduler: LrScheduler,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        epsilon: float = 1e-06,
+        loss_scale_factor: Optional[float] = None,
+        weight_decay: Optional[float] = None,
+        weight_decay_includes: Optional[Union[Sequence[Text], Text]] = None,
+        weight_decay_excludes: Optional[Union[Sequence[Text], Text]] = None,
+        grad_clipping: Optional[ClipGradientConf] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+        variables: Optional[
+            Union[Sequence[Text], Callable[[], Sequence[Text]]]
+        ] = GetVariablesForCurrentJob,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        self.lr_scheduler = lr_scheduler
+        self.grad_clipping = grad_clipping
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.weight_decay = weight_decay
+        if isinstance(weight_decay_includes, str):
+            weight_decay_includes = [weight_decay_includes]
+        if isinstance(weight_decay_excludes, str):
+            weight_decay_excludes = [weight_decay_excludes]
+        self.weight_decay_includes = weight_decay_includes
+        self.weight_decay_excludes = weight_decay_excludes
+        self.variables = variables
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        optimizer_conf = train_conf.mutable_optimizer_conf().Add()
+        self.lr_scheduler.SetLrFieldsInOptimizerConf(optimizer_conf)
+        if self.grad_clipping is not None:
+            optimizer_conf.mutable_clip_conf().CopyFrom(self.grad_clipping.clip_conf)
+        optimizer_conf.mutable_lamb_conf().set_beta1(self.beta1)
+        optimizer_conf.mutable_lamb_conf().set_beta2(self.beta2)
+        optimizer_conf.mutable_lamb_conf().set_epsilon(self.epsilon)
+        if self.weight_decay is not None:
+            optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(
+                self.weight_decay
+            )
+            assert not (
+                self.weight_decay_excludes is not None
+                and self.weight_decay_includes is not None
+            )
+            if self.weight_decay_includes is not None:
+                for weight_decay_include in self.weight_decay_includes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_includes().add_pattern(
+                        weight_decay_include
+                    )
+            elif self.weight_decay_excludes is not None:
+                for weight_decay_exclude in self.weight_decay_excludes:
+                    optimizer_conf.mutable_weight_decay_conf().mutable_excludes().add_pattern(
+                        weight_decay_exclude
+                    )
+        for variable in self.Variables():
+            optimizer_conf.add_variable_op_names(variable)
+
+
+class CombinedOptimizer(Optimizer):
+    """
+    Combined optimizer for multi optimizer case.
+
+    Args:
+        optimizers (Sequence[Optimizer]): optimizers to work together
+        loss_scale_factor (Optional[float], optional): The scale factor of loss. Defaults to None.
+        train_step_lbn (Optional[Text], optional): [description]. Defaults to None.
+        loss_scale_policy (Optional[LossScalePolicy]): The policy of loss scale.
+
+        Example: see test_multi_optimizer.py
+    """
+
+    def __init__(
+        self,
+        optimizers: Sequence[Optimizer],
+        loss_scale_factor: Optional[float] = None,
+        train_step_lbn: Optional[Text] = None,
+        loss_scale_policy: Optional[LossScalePolicy] = None,
+    ):
+        super().__init__(loss_scale_factor, train_step_lbn, loss_scale_policy)
+        for optimizer in optimizers:
+            assert not isinstance(
+                optimizer, CombinedOptimizer
+            ), "Forbid constructing CombinedOptimizer recursively"
+            assert (
+                optimizer.train_step_lbn is None
+            ), "Only one train step lbn among multi optimizers, please set thisparameter in CombinedOptimizer"
+            assert (
+                optimizer.loss_scale_policy is None
+            ), "Only one loss scale policy among multi optimizers, please set thisparameter in CombinedOptimizer"
+        self.optimizers = optimizers
+
+    def Variables(self) -> List[Text]:
+        if not self._variables_list_init:
+            self.variables = []
+            for optimizer in self.optimizers:
+                self.variables.append(optimizer.Variables())
+            self._variables_list_init = True
+        return self.variables
+
+    def _SanityCheck(self):
+        all_variables = set(GetVariablesForCurrentJob())
+        union_set = set()
+        inter_set = all_variables
+        for optimizer in self.optimizers:
+            s = set(optimizer.Variables())
+            union_set.union(s)
+            inter_set = inter_set.intersection(s)
+        assert union_set.issubset(all_variables)
+        assert (
+            len(inter_set) == 0
+        ), "Do not allow overlap of variables between multi optimizers"
+
+    def _AddOptimizerConfInTrainConf(self, train_conf) -> None:
+        self._SanityCheck()
+        for optimizer in self.optimizers:
+            optimizer._AddOptimizerConfInTrainConf(train_conf)
diff --git a/python/oneflow/ops/partial_fc_sample.py b/python/oneflow/ops/partial_fc_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bb903bbc48ce79034a1bec724cb6b9ab4498796
--- /dev/null
+++ b/python/oneflow/ops/partial_fc_sample.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Optional, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def distributed_partial_fc_sample(
+    weight: oneflow._oneflow_internal.BlobDesc,
+    label: oneflow._oneflow_internal.BlobDesc,
+    num_sample: int,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    parallel_num = flow.current_scope().device_parallel_desc_symbol.parallel_num
+    assert num_sample % parallel_num == 0
+    assert weight.shape[0] % parallel_num == 0
+    return (
+        flow.user_op_builder(
+            name
+            if name is not None
+            else id_util.UniqueStr("DistributedPartialFcSample_")
+        )
+        .Op("distributed_partial_fc_sample")
+        .Input("weight", [weight])
+        .Input("label", [label])
+        .Attr("num_sample", num_sample)
+        .Output("mapped_label")
+        .Output("sampled_label")
+        .Output("sampled_weight")
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
diff --git a/python/oneflow/ops/prelu.py b/python/oneflow/ops/prelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..672f0ecc24d3114fd16afb19d7f58075dd4cc665
--- /dev/null
+++ b/python/oneflow/ops/prelu.py
@@ -0,0 +1,155 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.job.regularizer_conf_pb2 as regularizer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def prelu(
+    inputs: oneflow._oneflow_internal.BlobDesc,
+    alpha_initializer: Optional[initializer_conf_util.InitializerConf] = None,
+    alpha_regularizer: Optional[regularizer_conf_util.RegularizerConf] = None,
+    shared_axes: Optional[Sequence[int]] = None,
+    trainable: bool = True,
+    name: str = "PRelu",
+    model_distribute: oneflow._oneflow_internal.distribute.Distribute = oneflow._oneflow_internal.distribute.broadcast(),
+) -> oneflow._oneflow_internal.BlobDesc:
+    """The Prelu(Parametric Rectified Linear Unit) activation.
+
+    The :math:`\\alpha` is a parameter that can be trained in network
+
+    The equation is
+
+    .. math::
+
+        out = max(0, x) + \\alpha*min(0, x)
+
+    Args:
+        inputs (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        alpha_initializer (Optional[initializer_conf_util.InitializerConf], optional): The initializer of alpha. Defaults to None.
+        alpha_regularizer (Optional[regularizer_conf_util.RegularizerConf], optional): The regularizer of alpha. Defaults to None.
+        shared_axes (Optional[Sequence[int]], optional): The axis along which to share learnable parameters for the prelu activation function. Defaults to None.
+        trainable (bool, optional): Whether to train the parameter :math:`\\alpha`. Defaults to True.
+        name (str, optional): The name for the operation. Defaults to "PRelu".
+        model_distribute (oneflow._oneflow_internal.distribute.Distribute, optional): Define the way to ditribute the model. Defaults to oneflow._oneflow_internal.distribute.broadcast().
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The activated Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import oneflow.typing as tp
+
+        BATCH_SIZE = 100
+
+
+        def lenet(data, train=False):
+            initializer = flow.truncated_normal(0.1)
+            conv1 = flow.layers.conv2d(
+                data,
+                32,
+                5,
+                padding="SAME",
+                name="conv1",
+                kernel_initializer=initializer,
+            )
+            prelu1 = flow.layers.prelu(conv1,
+                                    alpha_initializer=initializer,
+                                    shared_axes=[2, 3],
+                                    name="Prelu1")
+            pool1 = flow.nn.max_pool2d(
+                prelu1, ksize=2, strides=2, padding="SAME", name="pool1", data_format="NCHW"
+            )
+            conv2 = flow.layers.conv2d(
+                pool1,
+                64,
+                5,
+                padding="SAME",
+                name="conv2",
+                kernel_initializer=initializer,
+            )
+            prelu2 = flow.layers.prelu(conv2,
+                                    alpha_initializer=initializer,
+                                    shared_axes=[2, 3],
+                                    name="Prelu2")
+            pool2 = flow.nn.max_pool2d(
+                prelu2, ksize=2, strides=2, padding="SAME", name="pool2", data_format="NCHW"
+            )
+            reshape = flow.reshape(pool2, [pool2.shape[0], -1])
+            hidden = flow.layers.dense(
+                reshape,
+                512,
+                activation=flow.nn.relu,
+                kernel_initializer=initializer,
+                name="dense1",
+            )
+            if train:
+                hidden = flow.nn.dropout(hidden, rate=0.5, name="dropout")
+            return flow.layers.dense(hidden, 10, kernel_initializer=initializer, name="dense2")
+
+
+        @flow.global_function(type="train")
+        def train_job(
+                images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
+                labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
+        ) -> tp.Numpy:
+            with flow.scope.placement("gpu", "0:0"):
+                logits = lenet(images, train=True)
+                loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+                    labels, logits, name="softmax_loss"
+                )
+
+            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
+            flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
+            return loss
+
+    """
+    alpha_shape = list(inputs.shape[1:])
+    if shared_axes is not None:
+        for i in shared_axes:
+            assert i >= 1 and i < len(inputs.shape)
+            alpha_shape[i - 1] = 1
+    if alpha_initializer is None:
+        alpha_initializer = flow.constant_initializer(0)
+    with flow.scope.namespace(name):
+        alpha = flow.get_variable(
+            name="alpha",
+            shape=alpha_shape,
+            dtype=inputs.dtype,
+            initializer=alpha_initializer,
+            regularizer=alpha_regularizer,
+            trainable=trainable,
+            distribute=model_distribute,
+            reuse=False,
+        )
+    op = (
+        flow.user_op_builder(name)
+        .Op("prelu")
+        .Input("x", [inputs])
+        .Input("alpha", [alpha])
+        .Output("y")
+        .Build()
+    )
+    return op.InferAndTryRun().SoleOutputBlob()
diff --git a/python/oneflow/ops/quantize_ops.py b/python/oneflow/ops/quantize_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..55316bcd1a11241b10868519ccf813f4089643e5
--- /dev/null
+++ b/python/oneflow/ops/quantize_ops.py
@@ -0,0 +1,353 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Tuple
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def min_max_observer(
+    input: oneflow._oneflow_internal.BlobDesc,
+    quantization_bit: int = 8,
+    quantization_scheme: str = "symmetric",
+    quantization_formula: str = "google",
+    per_layer_quantization: bool = True,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """Compute the quantization parameters of the input tensor.
+
+    First compute the max and min values of input tensor:
+
+    .. math::
+
+        & max\\_value = max(input)
+
+        & min\\_value = min(input)
+
+    Then compute the scale and zero_point with the following equations:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit - 1} - 1
+
+            & scale = max(|max\\_value|,|min\\_value|) / denom
+
+            & zero\\_point = 0
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit} - 1
+
+            & scale = (max\\_value - min\\_value) / denom
+
+            & zero\\_point = -min\\_value / scale
+
+    If per_layer_quantization is False, then the shape of scale and zero_point will be (input.shape[0],).
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): input tensor.
+        quantization_bit (int): Quantize input to uintX / intX, X can be in range [2, 8]. Defaults to 8.
+        quantization_scheme (str): "symmetric" or "affine", quantize to signed / unsigned integer. Defaults to "symmetric".
+        quantization_formula (str): Support "google" or "cambricon".
+        per_layer_quantization (bool): True or False, means per-layer / per-channel quantization. Defaults to True.
+        name (Optional[str]): This operator's name. Defaults to None.
+
+    Returns:
+        Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]: The scale and zero_point of input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+        @flow.global_function(type="predict", function_config=flow.FunctionConfig())
+        def QuantizeJob(
+            input: tp.Numpy.Placeholder(input_shape, dtype=type_name_to_flow_type[dtype])
+        ): tp.Numpy
+            with flow.scope.placement(device_type, "0:0"):
+                scale, zero_point = flow.quantization.min_max_observer(
+                    input, quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google",
+                    per_layer_quantization=True
+                )
+            return scale, zero_point
+
+        input = (np.random.random(input_shape) - 0.5).astype(type_name_to_np_type[dtype])
+        scale, zero_point = QuantizeJob(input)
+
+    """
+    if quantization_formula == "cambricon" and (not per_layer_quantization):
+        raise NotImplementedError(
+            "per-channel mode is not supported in cambricon scheme"
+        )
+    (scale, zero_point) = (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("MinMaxObserver_")
+        )
+        .Op("min_max_observer")
+        .Input("in", [input])
+        .Output("scale")
+        .Output("zero_point")
+        .Attr("quantization_bit", quantization_bit)
+        .Attr("quantization_scheme", quantization_scheme)
+        .Attr("quantization_formula", quantization_formula)
+        .Attr("per_layer_quantization", per_layer_quantization)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (scale, zero_point)
+
+
+def moving_average_min_max_observer(
+    input: oneflow._oneflow_internal.BlobDesc,
+    quantization_bit: int = 8,
+    quantization_scheme: str = "symmetric",
+    quantization_formula: str = "google",
+    momentum: float = 0.95,
+    name: Optional[str] = None,
+) -> Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]:
+    """Compute the quantization parameters based on the moving average of the input tensor's min and max values.
+
+    First compute the moving\\_max and moving\\_min value of input tensor:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & moving\\_max = moving\\_max * momentum + |max(input)| * (1 - momentum)
+
+            & moving\\_min = moving\\_max
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & moving\\_max = moving\\_max * momentum + max(input) * (1 - momentum)
+
+            & moving\\_min = moving\\_min * momentum + min(input) * (1 - momentum)
+
+    The moving average of min and max values are initialized as the first batch of input `Blob`'s min and max.
+
+    Then compute the scale and zero_point with the following equations:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit - 1} - 1
+
+            & scale = moving\\_max / denom
+
+            & zero\\_point = 0
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & denom = 2^{quantization\\_to\\_bit} - 1
+
+            & scale = (moving\\_max - moving\\_min) / denom
+
+            & zero\\_point = -moving\\_min / scale
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): input tensor.
+        quantization_bit (int): Quantize input to uintX / intX, X can be in range [2, 8]. Defaults to 8.
+        quantization_scheme (str): "symmetric" or "affine", quantize to signed / unsigned integer. Defaults to "symmetric".
+        quantization_formula (str): Support "google" or "cambricon".
+        momentum (float): Smoothing parameter for exponential moving average operation. Defaults to 0.95.
+        name (Optional[str]): This operator's name. Defaults to None.
+
+    Returns:
+        Tuple[oneflow._oneflow_internal.BlobDesc, oneflow._oneflow_internal.BlobDesc]: The scale and zero_point of input tensor.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+        @flow.global_function(type="predict", function_config=flow.FunctionConfig())
+        def QuantizeJob(
+            input: tp.Numpy.Placeholder(input_shape, dtype=type_name_to_flow_type[dtype])
+        ): tp.Numpy
+            with flow.scope.placement(device_type, "0:0"):
+                scale, zero_point = flow.quantization.moving_average_min_max_observer(
+                    input, quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google",
+                    momentum=0.95
+                )
+            return scale, zero_point
+
+        input = (np.random.random(input_shape) - 0.5).astype(type_name_to_np_type[dtype])
+        scale, zero_point = QuantizeJob(input)
+
+    """
+    op_name = (
+        name if name is not None else id_util.UniqueStr("MovingAverageMinMaxObserver_")
+    )
+    training = True if flow.current_global_function_desc().IsTrainable() else False
+    with flow.scope.namespace(op_name):
+        moving_max = flow.get_variable(
+            "moving_max",
+            shape=(1,),
+            dtype=input.dtype,
+            initializer=flow.zeros_initializer(input.dtype),
+            trainable=False,
+        )
+        moving_min = flow.get_variable(
+            "moving_min",
+            shape=(1,),
+            dtype=input.dtype,
+            initializer=flow.zeros_initializer(input.dtype),
+            trainable=False,
+        )
+        current_train_step = flow.get_variable(
+            "current_train_step",
+            shape=(1,),
+            dtype=flow.int64,
+            initializer=flow.zeros_initializer(flow.int64),
+            trainable=False,
+        )
+    stop_update_after_iters = 1
+    (scale, zero_point) = (
+        flow.user_op_builder(op_name)
+        .Op("moving_average_min_max_observer")
+        .Input("in", [input])
+        .Input("current_train_step", [current_train_step])
+        .Input("moving_max", [moving_max])
+        .Input("moving_min", [moving_min])
+        .Output("scale")
+        .Output("zero_point")
+        .Attr("training", training)
+        .Attr("stop_update_after_iters", stop_update_after_iters)
+        .Attr("quantization_bit", quantization_bit)
+        .Attr("quantization_scheme", quantization_scheme)
+        .Attr("quantization_formula", quantization_formula)
+        .Attr("momentum", momentum)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (scale, zero_point)
+
+
+def fake_quantization(
+    input: oneflow._oneflow_internal.BlobDesc,
+    scale: oneflow._oneflow_internal.BlobDesc,
+    zero_point: oneflow._oneflow_internal.BlobDesc,
+    quantization_bit: int = 8,
+    quantization_scheme: str = "symmetric",
+    quantization_formula: str = "google",
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """Simulate the quantize and dequantize operations in training time.
+
+    The output will be computed as:
+
+        if quantization_scheme == "symmetric":
+
+        .. math::
+
+            & quant\\_max = 2^{quantization\\_to\\_bit - 1} - 1
+
+            & quant\\_min = -quant\\_max
+
+            & clamp(round(x / scale), quant\\_min, quant\\_max) * scale
+
+        elif quantization_scheme == "affine":
+
+        .. math::
+
+            & quant\\_max = 2^{quantization\\_to\\_bit} - 1
+
+            & quant\\_min = 0
+
+            & (clamp(round(x / scale + zero\\_point), quant\\_min, quant\\_max) - zero\\_point) * scale
+
+    Args:
+        input (oneflow._oneflow_internal.BlobDesc): input tensor.
+        scale (oneflow._oneflow_internal.BlobDesc): Computed by min_max_observer or moving_average_min_max_observer op.
+        zero_point (oneflow._oneflow_internal.BlobDesc): Computed by min_max_observer or moving_average_min_max_observer op.
+        quantization_bit (int): Quantize input to uintX / intX, X can be in range [2, 8]. Defaults to 8.
+        quantization_scheme (str): "symmetric" or "affine", quantize to signed / unsigned integer. Defaults to "symmetric".
+        quantization_formula (str): Support "google" or "cambricon".
+        name (Optional[str]): This operator's name. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: Input tensor after quantize and dequantize operations.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+        @flow.global_function(type="predict", function_config=flow.FunctionConfig())
+        def QuantizeJob(
+            input: tp.Numpy.Placeholder(input_shape, dtype=type_name_to_flow_type[dtype])
+        ): tp.Numpy
+            with flow.scope.placement(device_type, "0:0"):
+                scale, zero_point = flow.quantization.min_max_observer(
+                    input, quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google",
+                    per_layer_quantization=True
+                )
+                fake_quantize_out = flow.quantization.fake_quantization(
+                    input, scale, zero_point,
+                    quantization_bit=8,
+                    quantization_scheme="symmetric",
+                    quantization_formula="google"
+                )
+            return fake_quantize_out
+
+        input = (np.random.random(input_shape) - 0.5).astype(type_name_to_np_type[dtype])
+        fake_quantize_out = QuantizeJob(input)
+
+    """
+    return (
+        flow.user_op_builder(
+            name if name is not None else id_util.UniqueStr("Fake_Quantization_")
+        )
+        .Op("fake_quantization")
+        .Input("in", [input])
+        .Input("scale", [scale])
+        .Input("zero_point", [zero_point])
+        .Output("out")
+        .Attr("quantization_bit", quantization_bit)
+        .Attr("quantization_scheme", quantization_scheme)
+        .Attr("quantization_formula", quantization_formula)
+        .Build()
+        .InferAndTryRun()
+        .SoleOutputBlob()
+    )
diff --git a/python/oneflow/ops/random_ops.py b/python/oneflow/ops/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..010dabc7b30c12bfc850781fc4585da1d0bd49eb
--- /dev/null
+++ b/python/oneflow/ops/random_ops.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
+import oneflow.framework.module as module_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def Bernoulli(
+    x: oneflow._oneflow_internal.BlobDesc,
+    seed: Optional[int] = None,
+    dtype: Optional[flow.dtype] = None,
+    name: str = "Bernoulli",
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator returns a Blob with binaray random numbers (0 / 1) from a Bernoulli distribution.
+
+    Args:
+        x (oneflow._oneflow_internal.BlobDesc): The input Blob.
+        seed (Optional[int], optional): The random seed. Defaults to None.
+        dtype (Optional[flow.dtype], optional): The data type. Defaults to None.
+        name (str, optional): The name for the operation. Defaults to "Bernoulli".
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result Blob.
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def bernoulli_Job(x: tp.Numpy.Placeholder(shape=(3, 3), dtype=flow.float32),
+        ) -> tp.Numpy:
+            out = flow.random.bernoulli(x)
+            return out
+
+
+        x = np.array([[0.25, 0.45, 0.3],
+                    [0.55, 0.32, 0.13],
+                    [0.75, 0.15, 0.1]]).astype(np.float32)
+        out = bernoulli_Job(x)
+
+        # Because our random seed is not fixed, so the return value is different each time.
+        # out [[1. 0. 0.]
+        #      [0. 0. 1.]
+        #      [0. 0. 0.]]
+
+    """
+    assert isinstance(name, str)
+    if dtype is None:
+        dtype = x.dtype
+    if seed is not None:
+        assert name is not None
+    module = flow.find_or_create_module(
+        name, lambda: BernoulliModule(dtype=dtype, random_seed=seed, name=name)
+    )
+    return module(x)
+
+
+class BernoulliModule(module_util.Module):
+    def __init__(self, dtype: flow.dtype, random_seed: Optional[int], name: str):
+        module_util.Module.__init__(self, name)
+        (seed, has_seed) = flow.random.gen_seed(random_seed)
+        self.op_module_builder = (
+            flow.user_op_module_builder("bernoulli")
+            .InputSize("in", 1)
+            .Output("out")
+            .Attr("dtype", dtype)
+            .Attr("has_seed", has_seed)
+            .Attr("seed", seed)
+            .CheckAndComplete()
+        )
+        self.op_module_builder.user_op_module.InitOpKernel()
+
+    def forward(self, x: oneflow._oneflow_internal.BlobDesc):
+        if self.call_seq_no == 0:
+            name = self.module_name
+        else:
+            name = id_util.UniqueStr("Bernoulli_")
+        return (
+            self.op_module_builder.OpName(name)
+            .Input("in", [x])
+            .Build()
+            .InferAndTryRun()
+            .SoleOutputBlob()
+        )
diff --git a/python/oneflow/ops/random_util.py b/python/oneflow/ops/random_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..fac6e6a479280e8f7a497a3a788a20800057b78d
--- /dev/null
+++ b/python/oneflow/ops/random_util.py
@@ -0,0 +1,43 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import sys
+import typing
+
+import oneflow.framework.hob as hob
+import oneflow.support.enable_if as enable_if
+
+
+def api_gen_random_seed(seed: typing.Optional[int] = None):
+    api = enable_if.unique([consistent_gen_random_seed, mirrored_gen_random_seed])
+    return api(seed)
+
+
+@enable_if.condition(hob.consistent_view_enabled)
+def consistent_gen_random_seed(seed=None):
+    if seed is None:
+        seed = random.randint(-sys.maxsize, sys.maxsize)
+    return (seed, True)
+
+
+@enable_if.condition(hob.mirrored_view_enabled)
+def mirrored_gen_random_seed(seed=None):
+    if seed is None:
+        seed = -1
+        has_seed = False
+    else:
+        has_seed = True
+    return (seed, has_seed)
diff --git a/python/oneflow/ops/reduce_mean.py b/python/oneflow/ops/reduce_mean.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba16a3f6cb50a70261912a1cc2c32a59b1ffe55
--- /dev/null
+++ b/python/oneflow/ops/reduce_mean.py
@@ -0,0 +1,87 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections
+from typing import Optional, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+
+
+def reduce_mean(
+    input_blob: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[collections.Sized, int]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    """This operator computes the mean of input Blob along the specified axis
+
+    Args:
+        input_blob (oneflow._oneflow_internal.BlobDesc): A Blob
+        axis (Optional[Union[collections.Sized, int]], optional): The dimension along which the mean value is computed. Defaults to None.
+        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
+        name (Optional[str], optional): The name for the operation. Defaults to None.
+
+    Returns:
+        oneflow._oneflow_internal.BlobDesc: The result of average on the specified axis of input Blob
+
+    For example:
+
+    .. code-block:: python
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def reduce_mean_Job(x: tp.Numpy.Placeholder((3, 3))
+        ) -> tp.Numpy:
+            return flow.math.reduce_mean(x, axis=1, keepdims=True)
+
+
+        x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.float32)
+        out = reduce_mean_Job(x)
+
+        # out [[2.]
+        #      [5.]
+        #      [8.]]
+
+    """
+    reduce_sum = flow.math.reduce_sum(
+        input_blob, axis=axis, keepdims=keepdims, name=name
+    )
+    if input_blob.is_dynamic:
+        reduce_count = flow.math.reduced_shape_elem_cnt(
+            input_blob, axis=axis, dtype=input_blob.dtype
+        )
+        return reduce_sum / reduce_count
+    else:
+        if axis is None:
+            axes = []
+        else:
+            axes = list(axis) if isinstance(axis, collections.Sized) else [axis]
+        reduce_count = 1
+        if len(axes) == 0:
+            for dim in input_blob.shape:
+                reduce_count *= dim
+        else:
+            for i in axes:
+                reduce_count *= input_blob.shape[i]
+        return flow.math.multiply(reduce_sum, 1.0 / reduce_count)
diff --git a/python/oneflow/ops/regularizer_util.py b/python/oneflow/ops/regularizer_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5cdfac1f38cb791a1a08c28e797fda33eb6fa21
--- /dev/null
+++ b/python/oneflow/ops/regularizer_util.py
@@ -0,0 +1,152 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow.core.job.regularizer_conf_pb2 as regularizer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+def l1_l2_regularizer(
+    l1: float = 0.01, l2: float = 0.01
+) -> regularizer_conf_util.RegularizerConf:
+    """This operator creates a L1 and L2 weight regularizer. 
+
+    Args:
+        l1 (float, optional): The L1 regularization coefficient. Defaults to 0.01.
+        l2 (float, optional): The L2 regularization coefficient. Defaults to 0.01.
+
+    Returns:
+        regularizer_conf_util.RegularizerConf: A regularizer that can be used in other layers or operators.
+    
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_l1_l2_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            regularizer = flow.regularizers.l1_l2(l1=0.001, l2=0.001)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                kernel_regularizer=regularizer,
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_l1_l2_Job(x)
+    
+    """
+    regularizer = regularizer_conf_util.RegularizerConf()
+    setattr(regularizer.l1_l2_conf, "l1", l1)
+    setattr(regularizer.l1_l2_conf, "l2", l2)
+    return regularizer
+
+
+def l1_regularizer(l: float = 0.01) -> regularizer_conf_util.RegularizerConf:
+    """This operator creates a L1 weight regularizer. 
+
+    Args:
+        l (float, optional): The L1 regularization coefficient. Defaults to 0.01.
+
+    Returns:
+        regularizer_conf_util.RegularizerConf: A regularizer that can be used in other layers or operators.
+    
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_l1_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            regularizer = flow.regularizers.l1(l=0.001)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                kernel_regularizer=regularizer,
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_l1_Job(x)
+            
+    """
+    return l1_l2_regularizer(l1=l, l2=0.0)
+
+
+def l2_regularizer(l: float = 0.01) -> regularizer_conf_util.RegularizerConf:
+    """This operator creates a L2 weight regularizer. 
+
+    Args:
+        l (float, optional): The L2 regularization coefficient. Defaults to 0.01.
+
+    Returns:
+        regularizer_conf_util.RegularizerConf: A regularizer that can be used in other layers or operators.
+
+    For example: 
+
+    .. code-block:: python 
+
+        import oneflow as flow
+        import numpy as np
+        import oneflow.typing as tp
+
+
+        @flow.global_function()
+        def conv2d_l2_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))
+        ) -> tp.Numpy:
+            initializer = flow.truncated_normal(0.1)
+            regularizer = flow.regularizers.l2(l=0.001)
+            conv2d = flow.layers.conv2d(
+                x,
+                filters=128,
+                kernel_size=3,
+                strides=1,
+                padding='SAME',
+                kernel_initializer=initializer, 
+                kernel_regularizer=regularizer,
+                name="Conv2d"
+            )
+            return conv2d
+
+
+        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
+        out = conv2d_l2_Job(x)
+
+    """
+    return l1_l2_regularizer(l1=0.0, l2=l)
diff --git a/python/oneflow/ops/tensor_buffer_ops.py b/python/oneflow/ops/tensor_buffer_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4713fff9e4dd9484f3f8740cfc3618f5dbad65c9
--- /dev/null
+++ b/python/oneflow/ops/tensor_buffer_ops.py
@@ -0,0 +1,22 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import functools
+import operator
+from typing import List, Optional, Sequence
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.id_util as id_util
diff --git a/python/oneflow/ops/transpose_util.py b/python/oneflow/ops/transpose_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c93cd1a221796b0eb6dc4b43d4f0735babd3c366
--- /dev/null
+++ b/python/oneflow/ops/transpose_util.py
@@ -0,0 +1,36 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Sequence
+
+
+def is_perm(perm: Sequence[int]) -> bool:
+    return list(range(len(perm))) == sorted(list(perm))
+
+
+def get_perm_when_transpose_axis_to_last_dim(num_axes: int, axis: int) -> tuple:
+    axis = axis if axis >= 0 else axis + num_axes
+    assert 0 <= axis < num_axes, "axis out of range"
+    perm = [dim if dim < axis else dim + 1 for dim in range(num_axes - 1)]
+    perm.append(axis)
+    return tuple(perm)
+
+
+def get_inversed_perm(perm: Sequence[int]) -> tuple:
+    assert is_perm(perm)
+    inversed_perm = [-1] * len(perm)
+    for i in range(len(perm)):
+        inversed_perm[perm[i]] = i
+    return tuple(inversed_perm)
diff --git a/python/oneflow/ops/two_stage_reduce.py b/python/oneflow/ops/two_stage_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba238464533d73660a104599dc66d11d4ec039fd
--- /dev/null
+++ b/python/oneflow/ops/two_stage_reduce.py
@@ -0,0 +1,156 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Optional, Sequence, Union
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.distribute as distribute_util
+import oneflow.framework.hob as hob
+import oneflow.framework.id_util as id_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.ops.user_op_builder as user_op_builder
+import oneflow.support.enable_if as enable_if
+
+
+def api_two_stage_reduce_max(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([two_stage_reduce_max])
+    return func(x, axis=axis, keepdims=keepdims, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def two_stage_reduce_max(x, axis=None, keepdims=False, name=None):
+    name = name if name is not None else id_util.UniqueStr("ReduceMax_")
+    return two_stage_reduce(x, axis, keepdims, "reduce_max", name)
+
+
+def api_two_stage_reduce_min(
+    x: oneflow._oneflow_internal.BlobDesc,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+    name: Optional[str] = None,
+) -> oneflow._oneflow_internal.BlobDesc:
+    func = enable_if.unique([two_stage_reduce_min])
+    return func(x, axis=axis, keepdims=keepdims, name=name)
+
+
+@enable_if.condition(hob.in_global_mode)
+def two_stage_reduce_min(x, axis=None, keepdims=False, name=None):
+    name = name if name is not None else id_util.UniqueStr("ReduceMin_")
+    return two_stage_reduce(x, axis, keepdims, "reduce_min", name)
+
+
+def two_stage_reduce(x, axis=None, keepdims=False, op_type_name=None, name=None):
+    assert check_x_dictribute(x, axis)
+    axis = _check_axis(axis, x.shape)
+    device_stage_out_list = []
+    device_stage_count_list = []
+    distribute_axis = x.distribute.axis
+    x_list = flow.advanced.distribute_split(x, axis=distribute_axis)
+    parallel_desc_symbol = flow.current_scope().device_parallel_desc_symbol
+    device_tag = parallel_desc_symbol.device_tag
+    parallel_id = 0
+    for (
+        machine_id,
+        device_ids,
+    ) in parallel_desc_symbol.machine_id2device_id_list.items():
+        for device_id in device_ids:
+            with flow.scope.placement(
+                device_tag, "@" + str(machine_id) + ":" + str(device_id)
+            ):
+                (device_stage_out, device_stage_count) = reduce_device_stage(
+                    x_list[parallel_id],
+                    axis,
+                    op_type_name + "_device_stage",
+                    name + "_device_stage" + str(parallel_id),
+                )
+                device_stage_out_list.append(device_stage_out)
+                device_stage_count_list.append(device_stage_count)
+                parallel_id += 1
+    device_stage_out = flow.advanced.distribute_concat(
+        device_stage_out_list, axis=distribute_axis
+    )
+    device_stage_count = flow.advanced.distribute_concat(
+        device_stage_count_list, axis=distribute_axis
+    )
+    device_stage_out = device_stage_out.with_distribute(flow.distribute.broadcast())
+    device_stage_count = device_stage_count.with_distribute(flow.distribute.broadcast())
+    out = reduce_global_stage(
+        device_stage_out,
+        device_stage_count,
+        axis,
+        keepdims,
+        op_type_name + "_global_stage",
+        name + "_global_stage",
+    )
+    return out
+
+
+def reduce_device_stage(x, axis, op_name, name):
+    (out, mask, count) = (
+        flow.user_op_builder(name)
+        .Op(op_name)
+        .Input("in", [x])
+        .Output("out")
+        .Output("mask")
+        .Output("count")
+        .Attr("axis", axis)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return (out, count)
+
+
+def reduce_global_stage(x, device_count, axis, keepdims, op_name, name):
+    (out, mask) = (
+        flow.user_op_builder(name)
+        .Op(op_name)
+        .Input("in", [x])
+        .Input("device_count", [device_count])
+        .Output("out")
+        .Output("mask")
+        .Attr("axis", axis)
+        .Attr("keepdims", keepdims)
+        .Build()
+        .InferAndTryRun()
+        .RemoteBlobList()
+    )
+    return out
+
+
+def _check_axis(axis, shape):
+    if axis is None:
+        axis = list(range(len(shape)))
+    if isinstance(axis, int):
+        axis = [axis]
+    assert isinstance(axis, (list, tuple)), "Invalid axis {}".format(axis)
+    for x in axis:
+        if x < 0:
+            x += len(shape)
+        assert x >= 0 and x < len(shape), "Invalid axis {}".format(axis)
+    return axis
+
+
+def check_x_dictribute(x, axis):
+    for i in axis:
+        if x.distribute is oneflow._oneflow_internal.distribute.split(i):
+            return True
+    return False
diff --git a/python/oneflow/ops/user_op_builder.py b/python/oneflow/ops/user_op_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..88752423337fa096adb10e8e31063d835fafd3e8
--- /dev/null
+++ b/python/oneflow/ops/user_op_builder.py
@@ -0,0 +1,548 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import random
+import traceback
+
+from google.protobuf import text_format
+
+import oneflow
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.common.data_type as data_type_cfg
+import oneflow._oneflow_internal.oneflow.core.common.shape as shape_cfg
+import oneflow._oneflow_internal.oneflow.core.framework.user_op_attr as user_op_attr_cfg
+import oneflow.core.eager.eager_symbol_pb2 as eager_symbol_util
+import oneflow.core.framework.user_op_attr_pb2 as attr_value_pb
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util
+import oneflow.eager.eager_blob_util as eager_blob_util
+import oneflow.eager.gradient_util as gradient_util
+import oneflow.experimental.namescope as name_scope
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.compile_context as compile_context
+import oneflow.framework.distribute as distribute
+import oneflow.framework.hob as hob
+import oneflow.framework.interpret_util as interpret_util
+import oneflow.framework.remote_blob as remote_blob_util
+import oneflow.support.enable_if as enable_if
+
+blob_register = oneflow._oneflow_internal.GetDefaultBlobRegister()
+
+
+class UserOp(object):
+    def __init__(self, op_name, op_type_name=None):
+        self.op_conf_ = op_conf_util.OperatorConf()
+        self.op_conf_.name = op_name
+        if op_type_name is not None:
+            self.op_conf_.user_conf.op_type_name = op_type_name
+        device_tag = oneflow.current_scope().device_parallel_desc_symbol.device_tag
+        self.op_conf_.device_tag = device_tag
+        self.output_arg_key_list_ = []
+
+    @property
+    def op_conf(self):
+        return self.op_conf_
+
+    def InferAndTryRun(self):
+        raise NotImplementedError
+
+    def MakeRemoteBlob(self, lbi):
+        raise NotImplementedError
+
+    def RemoteBlobList(self):
+        remote_blob_list = []
+        for k in self.op_conf_.user_conf.output:
+            if k not in self.output_arg_key_list_:
+                raise ValueError(
+                    "output_arg_name {} of {} op is not set in python op builder".format(
+                        k, self.op_conf_.name
+                    )
+                )
+        for output_arg_name in self.output_arg_key_list_:
+            assert output_arg_name in self.op_conf_.user_conf.output
+            for i in range(len(self.op_conf_.user_conf.output[output_arg_name].s)):
+                lbi = logical_blob_id_util.LogicalBlobId()
+                lbi.op_name = self.op_conf_.name
+                lbi.blob_name = "{}_{}".format(output_arg_name, i)
+                remote_blob_obj = self.MakeRemoteBlob(lbi)
+                remote_blob_list.append(remote_blob_obj)
+                if flow.eager_execution_enabled():
+                    gradient_util.GetDefaultBackwardBlobRegister().TrySetObject4BlobName(
+                        remote_blob_obj.logical_blob_name, remote_blob_obj.blob_object
+                    )
+        return tuple(remote_blob_list)
+
+    def RemoteBlobDict(self):
+        remote_blob_dict = {}
+        for k in self.op_conf_.user_conf.output:
+            if k not in self.output_arg_key_list_:
+                raise ValueError(
+                    "output_arg_name {} of {} op is not set in python op builder".format(
+                        k, self.op_conf_.name
+                    )
+                )
+        for output_arg_name in self.output_arg_key_list_:
+            assert output_arg_name in self.op_conf_.user_conf.output
+            if output_arg_name not in remote_blob_dict:
+                remote_blob_dict[output_arg_name] = []
+            for i in range(len(self.op_conf_.user_conf.output[output_arg_name].s)):
+                lbi = logical_blob_id_util.LogicalBlobId()
+                lbi.op_name = self.op_conf_.name
+                lbi.blob_name = "{}_{}".format(output_arg_name, i)
+                remote_blob_dict[output_arg_name].append(self.MakeRemoteBlob(lbi))
+        return remote_blob_dict
+
+    def SoleOutputBlob(self):
+        blobs = self.RemoteBlobList()
+        assert len(blobs) == 1
+        return blobs[0]
+
+
+class UserOpModule(object):
+    @property
+    def opkernel_object(self):
+        return self.opkernel_object_
+
+    def set_opkernel_object(self, opkernel_object):
+        assert not hasattr(self, "opkernel_object_")
+        self.opkernel_object_ = opkernel_object
+
+    def InitOpKernel(self):
+        raise NotImplementedError
+
+
+def api_user_op_builder(op_name):
+    """Build a wrapper of user op.
+
+    For instance::
+        def myargmax(
+            input: oneflow._oneflow_internal.BlobDesc) -> oneflow._oneflow_internal.BlobDesc:
+            return (
+            flow.user_op_builder("myargmax")
+            .Op("argmax")
+            .Input("in", [input])
+            .Output("out")
+            .Build()
+            .InferAndTryRun()
+            .RemoteBlobList()[0]
+            )
+
+    Args:
+        op_name (str): name of new user op
+
+    Returns:
+        UserOpConfBuilder: `UserOpConfBuilder` object used to build a wrapper of user op.
+    """
+    api = enable_if.unique([lazy_user_op_builder, eager_user_op_builder])
+    return api(op_name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_user_op_builder(op_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+    return UserOpConfBuilder(LazyUserOp, op_name, None)
+
+
+class LazyUserOp(UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InferAndTryRun(self):
+        compile_context.CurJobAddOp(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_user_op_builder(op_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+    return UserOpConfBuilder(EagerUserOp, op_name, None)
+
+
+class EagerUserOp(UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InferAndTryRun(self):
+        interpret_util.Forward(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.EagerLogicalBlob(lbi)
+
+
+def api_consistent_user_op_builder(op_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+    return UserOpConfBuilder(ConsistentUserOp, op_name, None)
+
+
+class ConsistentUserOp(UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InferAndTryRun(self):
+        interpret_util.ConsistentForward(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+class UserOpConfBuilder(object):
+    def __init__(self, user_op_or_module_class, op_name, op_type_name):
+        self.user_op_ = user_op_or_module_class(op_name, op_type_name)
+
+    def CheckAndComplete(self):
+        assert self.user_op_.op_conf_.user_conf.op_type_name != ""
+        self.user_op_.op_conf_ = c_api_util.CheckAndCompleteUserOpConf(
+            self.user_op_.op_conf_
+        )
+        return self
+
+    def Build(self):
+        """Build op when in/output and other attribute set up.
+
+        Returns:
+            self
+
+        """
+        return self.CheckAndComplete().user_op_
+
+    def OpName(self, op_name):
+        job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+        op_name = name_scope.GetJobNameScopePrefix(job_name) + op_name
+        self.user_op_.op_conf_.name = op_name
+        user_conf = self.user_op_.op_conf_.user_conf
+
+        def GetLbn(output_name, i):
+            return "{}/{}_{}".format(op_name, output_name, i)
+
+        for (output_name, output) in user_conf.output.items():
+            output.s[:] = [GetLbn(output_name, i) for i in range(len(output.s))]
+        return self
+
+    def Op(self, op_type_name):
+        """set typename of op
+
+        Args:
+            op_type_name (string): op type name
+
+        Returns:
+            self
+        """
+        self.user_op_.op_conf_.user_conf.op_type_name = op_type_name
+        return self
+
+    def Input(self, input_name, input_blob_list):
+        """Set input blob of op
+
+        Args:
+            input_name (str): input name of blob
+            input_blob_list : list of blobs
+
+        Returns:
+            self
+        """
+        assert isinstance(input_blob_list, (tuple, list))
+        input_conf = self.user_op_.op_conf_.user_conf.input
+        input_conf[input_name].ClearField("s")
+        for input_blob in input_blob_list:
+            input_conf[input_name].s.append(input_blob.unique_name)
+        return self
+
+    def InputSize(self, input_name, input_blob_size):
+        input_conf = self.user_op_.op_conf_.user_conf.input
+        assert input_blob_size >= 0
+        assert input_name not in input_conf
+        for i in range(input_blob_size):
+            unique_name = "%s/%s_%s" % (self.user_op_.op_conf_.name, input_name, i)
+            input_conf[input_name].s.append(unique_name)
+        return self
+
+    def Output(self, output_name, num=1):
+        """Set output blob of op
+
+        Args:
+            output_name (str): name of output blob
+            num (int, optional):  Defaults to 1.
+
+        Returns:
+            self
+        """
+        assert isinstance(num, int) and num >= 1
+        out_lbns = []
+        for i in range(num):
+            lbn = "{}/{}_{}".format(self.user_op_.op_conf_.name, output_name, i)
+            out_lbns.append(lbn)
+        self.user_op_.op_conf_.user_conf.output[output_name].s[:] = out_lbns
+        self.user_op_.output_arg_key_list_.append(output_name)
+        return self
+
+    def Attr(self, attr_name, attr_value, attr_type_name=None):
+        """Set value of op's attribute.
+
+        Args:
+            attr_name (str): attribute name of op
+            attr_value (Any): attribute value of op
+
+        Raises:
+            ValueError: raised when value is not idential to op's attribute type.
+
+        Returns:
+            [type]: [description]
+        """
+        if attr_type_name != None:
+            print(
+                'WARNING: Argument \'attr_type_name\' of UserOpConfBuilder.Attr has been deprecated. Please remove it.\n\n            For instance:\n                -     .Attr("out_num", out_num, "AttrTypeInt64")\n                +     .Attr("out_num", out_num)\n                        '
+            )
+            print(traceback.format_stack()[-2])
+        attribute = user_op_attr_cfg.AttrValue()
+        assert isinstance(attr_name, str)
+        attr_type = oneflow._oneflow_internal.GetUserOpAttrType(
+            self.user_op_.op_conf_.user_conf.op_type_name, attr_name
+        )
+        if attr_type == user_op_attr_cfg.kAtInt32:
+            assert isinstance(attr_value, int)
+            attribute.set_at_int32(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtInt64:
+            assert isinstance(attr_value, int)
+            attribute.set_at_int64(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtBool:
+            assert isinstance(attr_value, bool)
+            attribute.set_at_bool(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtFloat:
+            assert isinstance(attr_value, (float, int))
+            attribute.set_at_float(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtDouble:
+            assert isinstance(attr_value, (float, int))
+            attribute.set_at_double(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtString:
+            assert isinstance(attr_value, str)
+            attribute.set_at_string(attr_value)
+        elif attr_type == user_op_attr_cfg.kAtShape:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_shape = attribute.mutable_at_shape()
+            for x in attr_value:
+                assert isinstance(x, int)
+                attribute_mutable_at_shape.add_dim(x)
+        elif attr_type == user_op_attr_cfg.kAtDataType:
+            assert attr_value in oneflow.dtypes()
+            attr_value = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(
+                attr_value
+            )
+            assert isinstance(attr_value, int)
+            attribute.set_at_data_type(data_type_cfg.DataType(attr_value))
+        elif attr_type == user_op_attr_cfg.kAtListInt32:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_int32 = attribute.mutable_at_list_int32()
+            for x in attr_value:
+                assert isinstance(x, int)
+                attribute_mutable_at_list_int32.add_val(x)
+        elif attr_type == user_op_attr_cfg.kAtListInt64:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_int64 = attribute.mutable_at_list_int64()
+            for x in attr_value:
+                assert isinstance(x, int)
+                attribute_mutable_at_list_int64.add_val(x)
+        elif attr_type == user_op_attr_cfg.kAtListFloat:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_float = attribute.mutable_at_list_float()
+            for x in attr_value:
+                assert isinstance(x, (float, int))
+                attribute_mutable_at_list_float.add_val(x)
+        elif attr_type == user_op_attr_cfg.kAtListDataType:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_data_type = attribute.mutable_at_list_data_type()
+            for x in attr_value:
+                assert x in oneflow.dtypes()
+                x = oneflow._oneflow_internal.deprecated.GetProtoDtype4OfDtype(x)
+                assert isinstance(x, int)
+                attribute_mutable_at_list_data_type.add_val(data_type_cfg.DataType(x))
+        elif attr_type == user_op_attr_cfg.kAtListShape:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_shape = (
+                attribute.mutable_at_list_shape().mutable_val()
+            )
+            for x in attr_value:
+                assert isinstance(x, (tuple, list))
+                shape = shape_cfg.ShapeProto()
+                for dim in x:
+                    assert isinstance(dim, int)
+                    shape.add_dim(dim)
+                attribute_mutable_at_list_shape.Add().CopyFrom(shape)
+        elif attr_type == user_op_attr_cfg.kAtListString:
+            assert isinstance(attr_value, (tuple, list))
+            attribute_mutable_at_list_string = attribute.mutable_at_list_string()
+            for x in attr_value:
+                assert isinstance(x, str)
+                attribute_mutable_at_list_string.add_val(x)
+        else:
+            raise ValueError("Invalid op attribute type {}".format(attr_type))
+        self.user_op_.op_conf_.user_conf.attr[attr_name].CopyFrom(
+            text_format.Parse(str(attribute), attr_value_pb.AttrValue())
+        )
+        return self
+
+
+def api_user_op_module_builder(op_type_name):
+    api = enable_if.unique(
+        [lazy_user_op_module_builder, eager_logical_user_op_module_builder]
+    )
+    return api(op_type_name)
+
+
+class UserOpModuleBuilder(UserOpConfBuilder):
+    def __init__(self, *args, **kwargs):
+        UserOpConfBuilder.__init__(self, *args, **kwargs)
+        self.user_op_module.op_conf.scope_symbol_id = flow.current_scope().symbol_id
+
+    @property
+    def user_op_module(self):
+        return self.user_op_
+
+    def Op(self, op_type_name):
+        raise ValueError(
+            "user op module builder of {} can't call '.Op(op_type_name)' method".format(
+                op_type_name
+            )
+        )
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(LazyUserOpModule, op_name, op_type_name)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_logical_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(EagerLogicalUserOpModule, op_name, op_type_name)
+
+
+class LazyUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        self.set_opkernel_object(None)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        compile_context.CurJobAddOp(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+class EagerLogicalUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        def BuildInstruction(builder):
+            if not isinstance(
+                self.op_conf,
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.OperatorConf,
+            ):
+                cfg_op_conf = oneflow._oneflow_internal.deprecated.MakeOpConfByString(
+                    str(self.op_conf)
+                )
+            self.set_opkernel_object(builder.NewOpKernelObject(cfg_op_conf))
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        interpret_util.OpKernelForward(self.op_conf, self.opkernel_object)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.EagerLogicalBlob(lbi)
+
+
+def api_consistent_user_op_module_builder(op_type_name):
+    api = enable_if.unique(
+        [
+            lazy_consistent_user_op_module_builder,
+            eager_consistent_user_op_module_builder,
+        ]
+    )
+    return api(op_type_name)
+
+
+@enable_if.condition(hob.in_global_mode & ~hob.eager_execution_enabled)
+def lazy_consistent_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(LazyConsistentUserOpModule, op_name, op_type_name)
+
+
+@enable_if.condition(hob.in_global_mode & hob.eager_execution_enabled)
+def eager_consistent_user_op_module_builder(op_type_name):
+    job_name = oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName()
+    op_name = name_scope.GetJobNameScopePrefix(job_name) + op_type_name
+    return UserOpModuleBuilder(EagerConsistentUserOpModule, op_name, op_type_name)
+
+
+class LazyConsistentUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        self.set_opkernel_object(None)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        compile_context.CurJobAddConsistentOp(self.op_conf_)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.RemoteBlob(lbi)
+
+
+class EagerConsistentUserOpModule(UserOpModule, UserOp):
+    def __init__(self, op_name, op_type_name):
+        UserOp.__init__(self, op_name, op_type_name)
+
+    def InitOpKernel(self):
+        def BuildInstruction(builder):
+            if not isinstance(
+                self.op_conf,
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.OperatorConf,
+            ):
+                cfg_op_conf = oneflow._oneflow_internal.deprecated.MakeOpConfByString(
+                    str(self.op_conf)
+                )
+            self.set_opkernel_object(builder.NewOpKernelObject(cfg_op_conf))
+
+        oneflow._oneflow_internal.deprecated.LogicalRun(BuildInstruction)
+
+    def InferAndTryRun(self):
+        assert hob.in_global_mode(None)
+        interpret_util.OpKernelConsistentForward(self.op_conf, self.opkernel_object)
+        return self
+
+    def MakeRemoteBlob(self, lbi):
+        return remote_blob_util.EagerLogicalBlob(lbi)
diff --git a/python/oneflow/ops/util/__init__.py b/python/oneflow/ops/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/ops/util/custom_op_module.py b/python/oneflow/ops/util/custom_op_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..f68dfe747066cbaf0dfb3c98aed6c9fb534d0f61
--- /dev/null
+++ b/python/oneflow/ops/util/custom_op_module.py
@@ -0,0 +1,170 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import importlib.util
+import os
+import os.path
+import shutil
+import subprocess as sp
+import sys
+import sysconfig
+
+import numpy
+
+import oneflow
+import oneflow._oneflow_internal
+import oneflow.framework.sysconfig as oneflow_sysconfig
+
+
+def run_cmd(cmd, cwd=None):
+    if cwd:
+        res = sp.run(cmd, cwd=cwd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT)
+    else:
+        res = sp.run(cmd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT)
+    out = res.stdout.decode("utf8")
+    if res.returncode != 0:
+        err_msg = "Run cmd failed: {}, output: {}".format(cmd, out)
+        raise Exception(err_msg)
+    if len(out) and out[-1] == "\n":
+        out = out[:-1]
+    return out
+
+
+def compile(compiler, flags, link, inputs, output):
+    if os.path.exists(output):
+        return True
+    if isinstance(inputs, list):
+        cmd = "{} {} {} {} -o {}".format(
+            compiler, " ".join(inputs), flags, link, output
+        )
+    else:
+        cmd = "{} {} {} {} -o {}".format(compiler, inputs, flags, link, output)
+    run_cmd(cmd)
+    return True
+
+
+def get_cflags():
+    return " ".join(oneflow_sysconfig.get_compile_flags())
+
+
+def get_lflags():
+    return (
+        " ".join(oneflow_sysconfig.get_link_flags())
+        + " -Wl,-rpath "
+        + oneflow_sysconfig.get_lib()
+    )
+
+
+class PythonKernelRegistry(object):
+    """A helper class to store python kernel module
+    """
+
+    def __init__(self):
+        self.kernels_ = {}
+
+    def Register(self, op_module_name, module):
+        self.kernels_[op_module_name] = module
+
+
+_python_kernel_reg = PythonKernelRegistry()
+
+
+class CustomOpModule(object):
+    def __init__(self, op_module_name, module_path=""):
+        self.op_module_name_ = op_module_name
+        self.api = None
+        self.so_path_ = ""
+        self.objs_ = []
+        self.has_api_ = False
+        self.has_def_ = False
+        self.has_py_kernel_ = False
+        self.has_cpu_kernel_ = False
+        self.has_gpu_kernel_ = False
+        self.got_so_ = False
+        module_path = os.path.normpath(module_path)
+        pwd_path = os.getcwd()
+        if module_path != "." and module_path != pwd_path:
+            module_folder = os.path.join(module_path, self.op_module_name_)
+            pwd_folder = os.path.join(pwd_path, self.op_module_name_)
+            if os.path.exists(pwd_folder):
+                shutil.rmtree(pwd_folder)
+            shutil.copytree(module_folder, pwd_folder)
+        self.src_prefix_ = os.path.join(
+            pwd_path, self.op_module_name_, self.op_module_name_
+        )
+        out_path = os.path.join(pwd_path, self.op_module_name_, "out")
+        if not os.path.exists(out_path):
+            os.makedirs(out_path)
+        self.out_prefix_ = os.path.join(out_path, self.op_module_name_)
+
+    def py_api(self):
+        assert os.path.exists("{}_py_api.py".format(self.src_prefix_))
+        spec = importlib.util.spec_from_file_location(
+            self.op_module_name_, "{}_py_api.py".format(self.src_prefix_)
+        )
+        self.api = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(self.api)
+        return self
+
+    def cpp_def(self):
+        flags = "-std=c++11 -c -fPIC -O2 " + get_cflags()
+        compile(
+            "g++",
+            flags,
+            get_lflags(),
+            "{}_cpp_def.cpp".format(self.src_prefix_),
+            "{}_cpp_def.o".format(self.out_prefix_),
+        )
+        self.objs_.append("{}_cpp_def.o".format(self.out_prefix_))
+        self.has_def_ = True
+        return self
+
+    def py_kernel(self):
+        assert os.path.exists("{}_py_kernel.py".format(self.src_prefix_))
+        spec = importlib.util.spec_from_file_location(
+            self.op_module_name_, "{}_py_kernel.py".format(self.src_prefix_)
+        )
+        kernel = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(kernel)
+        _python_kernel_reg.Register(self.op_module_name_, kernel)
+        oneflow._oneflow_internal.RegisterPyKernelCaller(self.op_module_name_)
+        self.has_py_kernel_ = True
+        return self
+
+    def cpp_kernel(self):
+        flags = "-std=c++11 -c -fPIC -O2 " + get_cflags()
+        compile(
+            "g++",
+            flags,
+            "",
+            "{}_cpp_kernel.cpp".format(self.src_prefix_),
+            "{}_cpp_kernel.o".format(self.out_prefix_),
+        )
+        self.objs_.append("{}_cpp_kernel.o".format(self.out_prefix_))
+        self.has_cpu_kernel_ = True
+        return self
+
+    def gpu_kernel(self):
+        raise NotImplementedError
+
+    def build_load(self):
+        if len(self.objs_) > 0:
+            flags = "-std=c++11 -shared -fPIC " + get_cflags()
+            compile(
+                "g++", flags, get_lflags(), self.objs_, "{}.so".format(self.out_prefix_)
+            )
+            self.got_so_ = True
+            self.so_path_ = self.out_prefix_ + ".so"
+        oneflow.config.load_library_now(self.so_path_)
diff --git a/python/oneflow/optim/__init__.py b/python/oneflow/optim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2789b7eaa7aabe17486a5ac1a135f2ddbdd46e58
--- /dev/null
+++ b/python/oneflow/optim/__init__.py
@@ -0,0 +1,22 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.optimizer.adam import Adam
+from oneflow.nn.optimizer.adamw import AdamW
+from oneflow.nn.optimizer.optimizer import Optimizer
+from oneflow.nn.optimizer.rmsprop import RMSprop
+from oneflow.nn.optimizer.sgd import SGD
+
+from . import lr_scheduler
diff --git a/python/oneflow/optim/lr_scheduler.py b/python/oneflow/optim/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..96320a2f70e1c0ab6d4c5b71b7d2f71be3d181ce
--- /dev/null
+++ b/python/oneflow/optim/lr_scheduler.py
@@ -0,0 +1,19 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.optimizer.cosine_annealing_lr import CosineAnnealingLR
+from oneflow.nn.optimizer.lambda_lr import LambdaLR
+from oneflow.nn.optimizer.lr_scheduler import LrScheduler as _LRScheduler
+from oneflow.nn.optimizer.step_lr import StepLR
diff --git a/python/oneflow/optimizer/__init__.py b/python/oneflow/optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5262358366abd6d461a6d2cad774e0cb7ad9ae6
--- /dev/null
+++ b/python/oneflow/optimizer/__init__.py
@@ -0,0 +1,36 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.optimizer import (
+    LAMB,
+    LARS,
+    SGD,
+    SGDW,
+    Adam,
+    AdamW,
+    CombinedOptimizer,
+    CosineScheduler,
+    CustomScheduler,
+    ExponentialScheduler,
+    InverseTimeScheduler,
+    LazyAdam,
+    LinearCosineScheduler,
+    NaturalExpScheduler,
+    PiecewiseConstantScheduler,
+    PiecewiseScalingScheduler,
+    PolynomialSchduler,
+    PolynomialScheduler,
+    RMSProp,
+)
diff --git a/python/oneflow/optimizer/grad_clipping.py b/python/oneflow/optimizer/grad_clipping.py
new file mode 100644
index 0000000000000000000000000000000000000000..25115be3ccad54447b38b5734b729b90c39fb203
--- /dev/null
+++ b/python/oneflow/optimizer/grad_clipping.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.optimizer import by_global_norm
diff --git a/python/oneflow/optimizer/loss_scale.py b/python/oneflow/optimizer/loss_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..c96c59fd4da47d9db9f5c528f3512cc7c4b095a8
--- /dev/null
+++ b/python/oneflow/optimizer/loss_scale.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.optimizer import DynamicLossScalePolicy as dynamic_loss_scale
+from oneflow.ops.optimizer import StaticLossScalePolicy as static_loss_scale
diff --git a/python/oneflow/optimizer/warmup.py b/python/oneflow/optimizer/warmup.py
new file mode 100644
index 0000000000000000000000000000000000000000..37df34a45c529616dd27c5a1b08a88a642636b4b
--- /dev/null
+++ b/python/oneflow/optimizer/warmup.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.optimizer import constant, linear
diff --git a/python/oneflow/profiler.py b/python/oneflow/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e29bbde696b12cb03236e593942af0caa3f7d0a
--- /dev/null
+++ b/python/oneflow/profiler.py
@@ -0,0 +1,19 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.profiler import ProfilerStart as profiler_start
+from oneflow.framework.profiler import ProfilerStop as profiler_stop
+from oneflow.framework.profiler import RangePop as range_pop
+from oneflow.framework.profiler import RangePush as range_push
diff --git a/python/oneflow/quantization.py b/python/oneflow/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c754a45a26820863c52c8a87042584cb2841b0b
--- /dev/null
+++ b/python/oneflow/quantization.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.quantize_ops import (
+    fake_quantization,
+    min_max_observer,
+    moving_average_min_max_observer,
+)
diff --git a/python/oneflow/random.py b/python/oneflow/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..b10ce648152eeda00132c5f92dcccc368f670211
--- /dev/null
+++ b/python/oneflow/random.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.random_ops import Bernoulli as bernoulli
+from oneflow.ops.random_util import api_gen_random_seed as gen_seed
diff --git a/python/oneflow/regularizers.py b/python/oneflow/regularizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9661cf0926f0c3fffa964bcaa8ed336a06a30fed
--- /dev/null
+++ b/python/oneflow/regularizers.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.regularizer_util import l1_l2_regularizer as l1_l2
+from oneflow.ops.regularizer_util import l1_regularizer as l1
+from oneflow.ops.regularizer_util import l2_regularizer as l2
diff --git a/python/oneflow/saved_model.py b/python/oneflow/saved_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f5a89b1aecbc325fcdf856f39672647a8906c46
--- /dev/null
+++ b/python/oneflow/saved_model.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.serving.saved_model_builder import (
+    GraphBuilder,
+    ModelBuilder,
+    SignatureBuilder,
+)
diff --git a/python/oneflow/sbp.py b/python/oneflow/sbp.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6e4190ff40ce87efd4556d6618144370b0ded95
--- /dev/null
+++ b/python/oneflow/sbp.py
@@ -0,0 +1,19 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.distribute import split_sbp as split
+
+broadcast = oneflow._oneflow_internal.sbp.broadcast()
+partial_sum = oneflow._oneflow_internal.sbp.partial_sum()
diff --git a/python/oneflow/scope.py b/python/oneflow/scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..7da334b8d0669ddd64f1587d31b854c088dcc5d0
--- /dev/null
+++ b/python/oneflow/scope.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.experimental.namescope import name_scope as namespace
+from oneflow.framework.distribute import (
+    ConsistentStrategyEnabled as consistent_view_enabled,
+)
+from oneflow.framework.distribute import DistributeConsistentStrategy as consistent_view
+from oneflow.framework.distribute import DistributeMirroredStrategy as mirrored_view
+from oneflow.framework.distribute import (
+    MirroredStrategyEnabled as mirrored_view_enabled,
+)
+from oneflow.framework.placement_util import api_placement as placement
+from oneflow.framework.scope_util import deprecated_current_scope as current_scope
diff --git a/python/oneflow/serving/__init__.py b/python/oneflow/serving/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcbfb38a4afd9e4ecb9c2abf8e60ae8214ae07de
--- /dev/null
+++ b/python/oneflow/serving/__init__.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from oneflow.serving.inference_session import (
+    InferenceSession,
+    ModelVersionPolicy,
+    SessionOption,
+)
diff --git a/python/oneflow/serving/inference_session.py b/python/oneflow/serving/inference_session.py
new file mode 100644
index 0000000000000000000000000000000000000000..f23462a5d7863769dbb7617778bb506c7744d2e1
--- /dev/null
+++ b/python/oneflow/serving/inference_session.py
@@ -0,0 +1,487 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import asyncio
+import contextlib
+import enum
+import inspect
+import os
+
+import google.protobuf.text_format as text_format
+import numpy as np
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow._oneflow_internal.oneflow.core.common.data_type as dtype_proto_cfg
+import oneflow._oneflow_internal.oneflow.core.common.shape as shape_proto_cfg
+import oneflow._oneflow_internal.oneflow.core.job.job_conf as job_conf_proto_cfg
+import oneflow._oneflow_internal.oneflow.core.job.sbp_parallel as sbp_parallel_cfg
+import oneflow._oneflow_internal.oneflow.core.operator.interface_blob_conf as interface_blob_conf_proto_cfg
+import oneflow.core.job.job_conf_pb2 as job_conf_proto
+import oneflow.core.operator.interface_blob_conf_pb2 as interface_blob_conf_proto
+import oneflow.core.serving.saved_model_pb2 as saved_model_pb
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.compile_context as compile_ctx
+import oneflow.framework.dtype as dtype_util
+import oneflow.framework.input_blob_def as input_blob_util
+import oneflow.framework.job_instance as job_instance_util
+import oneflow.framework.placement_util as placement_util
+import oneflow.framework.runtime_mode as runtime_mode
+import oneflow.framework.scope_util as scope_util
+import oneflow.framework.session_util as session_util
+
+
+def _is_int(val):
+    try:
+        num = int(val)
+    except ValueError:
+        return False
+    return True
+
+
+def _find_model_latest_version(saved_model_dir):
+    version_dirs = []
+    for f in os.listdir(saved_model_dir):
+        if os.path.isdir(os.path.join(saved_model_dir, f)) and _is_int(f):
+            version_dirs.append(f)
+    version_dirs.sort(reverse=True, key=lambda x: int(x))
+    return version_dirs[0]
+
+
+def _need_check_device_tag(op_conf):
+    if op_conf.HasField("return_conf"):
+        return False
+    return op_conf.HasField("device_tag")
+
+
+def _signature_proto_to_cfg(signature_proto, mut_signature_cfg):
+    assert isinstance(signature_proto, job_conf_proto.JobSignatureDef)
+    assert isinstance(mut_signature_cfg, job_conf_proto_cfg.JobSignatureDef)
+    for (input_name, input_def) in signature_proto.inputs.items():
+        input_def_cfg = job_conf_proto_cfg.JobInputDef()
+        input_def_cfg.mutable_lbi().set_op_name(input_def.lbi.op_name)
+        input_def_cfg.mutable_lbi().set_blob_name(input_def.lbi.blob_name)
+        _inferface_blob_conf_proto_to_cfg(
+            input_def.blob_conf, input_def_cfg.mutable_blob_conf()
+        )
+        mut_signature_cfg.mutable_inputs()[input_name].CopyFrom(input_def_cfg)
+    for (output_name, output_def) in signature_proto.outputs.items():
+        output_def_cfg = job_conf_proto_cfg.JobOutputDef()
+        output_def_cfg.mutable_lbi().set_op_name(output_def.lbi.op_name)
+        output_def_cfg.mutable_lbi().set_blob_name(output_def.lbi.blob_name)
+        mut_signature_cfg.mutable_outputs()[output_name].CopyFrom(output_def_cfg)
+
+
+def _inferface_blob_conf_proto_to_cfg(
+    inferface_blob_conf_proto, mut_inferface_blob_conf_cfg
+):
+    assert isinstance(
+        inferface_blob_conf_proto, interface_blob_conf_proto.InterfaceBlobConf
+    )
+    assert isinstance(
+        mut_inferface_blob_conf_cfg, interface_blob_conf_proto_cfg.InterfaceBlobConf
+    )
+    shape = shape_proto_cfg.ShapeProto()
+    for dim in inferface_blob_conf_proto.shape.dim:
+        shape.add_dim(dim)
+    mut_inferface_blob_conf_cfg.mutable_shape().CopyFrom(shape)
+    dtype = dtype_proto_cfg.DataType(int(inferface_blob_conf_proto.data_type))
+    mut_inferface_blob_conf_cfg.set_data_type(dtype)
+    if inferface_blob_conf_proto.HasField("parallel_distribution"):
+        assert len(inferface_blob_conf_proto.parallel_distribution.sbp_parallel) == 1
+        sbp_proto = inferface_blob_conf_proto.parallel_distribution.sbp_parallel[0]
+        if sbp_proto.HasField("split_parallel"):
+            split_axis = sbp_proto.split_parallel.axis
+            sbp = sbp_parallel_cfg.SbpParallel()
+            sbp.mutable_split_parallel().set_axis(split_axis)
+            mut_inferface_blob_conf_cfg.mutable_parallel_distribution().mutable_sbp_parallel().Add().CopyFrom(
+                sbp
+            )
+    mut_inferface_blob_conf_cfg.set_is_dynamic(inferface_blob_conf_proto.is_dynamic)
+
+
+class ModelVersionPolicy(enum.Enum):
+    LATEST = 1
+
+
+class SessionOption(object):
+    def __init__(self):
+        self.device_tag = "gpu"
+        self.device_num = 1
+        self.is_mirrored_view = False
+
+
+class InferenceSession(object):
+    class SessionStatus(enum.Enum):
+        OPEN = 1
+        RUNNING = 2
+        CLOSED = 3
+
+    def __init__(self, option=None):
+        if option is None:
+            self.option_ = SessionOption()
+        else:
+            assert isinstance(option, SessionOption)
+            self.option_ = option
+        self.is_mirrored_ = self.option_.is_mirrored_view
+        self.checkpoint_path_ = None
+        self.config_proto_ = None
+        self.job_name2job_conf_ = {}
+        self.inter_user_job_info_ = None
+        self.cur_job_name_ = None
+        self.inferface_name2info_ = {}
+        self.output_name2future_ = {}
+        self.job_futures_ = []
+        self.status_ = None
+        self._init_event_loop()
+        self.init()
+
+    def __del__(self):
+        if self.status_ != self.SessionStatus.CLOSED:
+            self.close()
+
+    def _init_event_loop(self):
+        self.event_loop_ = asyncio.get_event_loop()
+        if self.event_loop_.is_closed():
+            asyncio.set_event_loop(asyncio.new_event_loop())
+            self.event_loop_ = asyncio.get_event_loop()
+
+    def init(self):
+        if not oneflow._oneflow_internal.IsEnvInited():
+            flow.env.init()
+        if not oneflow._oneflow_internal.IsSessionInited():
+            self._make_config_proto()
+            session_util._TryCompleteConfigProto(self.config_proto_)
+            c_api_util.InitLazyGlobalSession(self.config_proto_)
+        self.status_ = self.SessionStatus.OPEN
+
+    def close(self):
+        self.event_loop_.run_until_complete(self.wait_for_all_jobs_finished())
+        self.event_loop_.close()
+        if self.status_ == self.SessionStatus.RUNNING:
+            oneflow._oneflow_internal.StopLazyGlobalSession()
+            oneflow._oneflow_internal.DestroyLazyGlobalSession()
+        elif self.status_ == self.SessionStatus.OPEN:
+            oneflow._oneflow_internal.DestroyLazyGlobalSession()
+        else:
+            pass
+        self.status_ = self.SessionStatus.CLOSED
+
+    def _check_status(self, *status):
+        check_success = False
+        for stat in status:
+            if self.status_ == stat:
+                check_success = True
+                break
+        if check_success is False:
+            caller_func_name = inspect.stack()[1].function
+            allowed_status = ",".join(status)
+            raise ValueError(
+                "The calling to {} is only allowed when status is {}, current status is {}".format(
+                    caller_func_name, allowed_status, self.status_
+                )
+            )
+
+    def _make_config_proto(self):
+        if self.config_proto_ is None:
+            self.config_proto_ = session_util._GetDefaultConfigProto()
+        if self.option_.device_tag == "gpu":
+            self.config_proto_.resource.gpu_device_num = self.option_.device_num
+        elif self.option_.device_tag == "cpu":
+            self.config_proto_.resource.cpu_device_num = self.option_.device_num
+            self.config_proto_.resource.gpu_device_num = 0
+        else:
+            raise NotImplementedError(
+                "not supported device tag {}".format(self.option_.device_tag)
+            )
+        self.config_proto_.resource.enable_legacy_model_io = True
+
+    def set_checkpoint_path(self, checkpoint_path):
+        self._check_status(self.SessionStatus.OPEN)
+        self.checkpoint_path_ = checkpoint_path
+
+    def set_job_signature(self, job_name, signature):
+        assert isinstance(signature, job_conf_proto.JobSignatureDef)
+        job_conf = self._get_job_conf(job_name)
+        _signature_proto_to_cfg(signature, job_conf.mutable_signature())
+
+    def set_job_batch_size(self, job_name, batch_size):
+        self._check_status(self.SessionStatus.OPEN)
+        job_conf = self._get_job_conf(job_name)
+        for (_, mut_input_def) in job_conf.mutable_signature().mutable_inputs().items():
+            mut_shape = mut_input_def.mutable_blob_conf().mutable_shape()
+            mut_shape.mutable_dim()[0] = batch_size
+
+    def _get_job_conf(self, job_name):
+        if job_name in self.job_name2job_conf_:
+            return self.job_name2job_conf_[job_name]
+        else:
+            job_conf = job_conf_proto_cfg.JobConfigProto()
+            job_conf.set_job_name(job_name)
+            job_conf.mutable_predict_conf()
+            self.job_name2job_conf_[job_name] = job_conf
+            return job_conf
+
+    @contextlib.contextmanager
+    def open(self, job_name, signature=None, batch_size=None):
+        self._check_status(self.SessionStatus.OPEN)
+        c_api_util.JobBuildAndInferCtx_Open(job_name)
+        if signature is not None:
+            self.set_job_signature(job_name, signature)
+        if isinstance(batch_size, int):
+            self.set_job_batch_size(job_name, batch_size)
+        job_conf = self._get_job_conf(job_name)
+        c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+        tag_and_dev_ids = placement_util.GetDefaultMachineDeviceIds(
+            self.config_proto_.resource
+        )
+        scope = scope_util.MakeInitialScope(
+            job_conf, *tag_and_dev_ids, None, self.is_mirrored_
+        )
+        with runtime_mode.ModeScope(runtime_mode.GLOBAL_MODE):
+            with scope_util.ScopeContext(scope):
+                self.cur_job_name_ = job_name
+                yield self
+                self.cur_job_name_ = None
+        oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+    def compile(self, op_list):
+        self._check_status(self.SessionStatus.OPEN)
+        scope = flow.current_scope()
+        device_tag = scope.device_parallel_desc_symbol.device_tag
+        for op_conf in op_list:
+            if _need_check_device_tag(op_conf) and op_conf.device_tag != device_tag:
+                print(
+                    "WARNING: the device_tag of op {} is not equal to the device_tag of seesion's current scope ({} vs. {}), which may cause the op graph to be incompatible".format(
+                        op_conf.name, op_conf.device_tag, device_tag
+                    )
+                )
+            compile_ctx.CurJobAddOp(op_conf)
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
+        oneflow._oneflow_internal.CurJobBuildAndInferCtx_Rebuild()
+
+    def launch(self):
+        self._check_status(self.SessionStatus.OPEN)
+        oneflow._oneflow_internal.StartLazyGlobalSession()
+        self.inter_user_job_info_ = c_api_util.GetInterUserJobInfo()
+        self._run_load_checkpoint_job()
+        self.status_ = self.SessionStatus.RUNNING
+
+    def load_saved_model(
+        self,
+        saved_model_dir,
+        model_version=ModelVersionPolicy.LATEST,
+        saved_model_meta_file_basename="saved_model",
+        graph_name=None,
+        signature_name=None,
+    ):
+        if not os.path.isdir(saved_model_dir):
+            raise ValueError("{} is not a valid directory".format(saved_model_dir))
+        if isinstance(model_version, int):
+            pass
+        elif model_version == ModelVersionPolicy.LATEST:
+            model_version = _find_model_latest_version(saved_model_dir)
+        else:
+            raise NotImplementedError
+        saved_model_path = os.path.join(saved_model_dir, str(model_version))
+        if not os.path.isdir(saved_model_path):
+            raise ValueError(
+                "version {} of saved model in dir {} do not exist".format(
+                    model_version, saved_model_dir
+                )
+            )
+        subfiles = list(os.listdir(saved_model_path))
+        saved_model_meta_pb_filename = saved_model_meta_file_basename + ".pb"
+        saved_model_meta_prototxt_filename = (
+            saved_model_meta_file_basename + ".prototxt"
+        )
+        saved_model_proto = saved_model_pb.SavedModel()
+        if saved_model_meta_pb_filename in subfiles:
+            saved_model_meta_file_path = os.path.join(
+                saved_model_path, saved_model_meta_pb_filename
+            )
+            with open(saved_model_meta_file_path, "rb") as f:
+                saved_model_proto.ParseFromString(f.read())
+        elif saved_model_meta_prototxt_filename in subfiles:
+            saved_model_meta_file_path = os.path.join(
+                saved_model_path, saved_model_meta_prototxt_filename
+            )
+            with open(saved_model_meta_file_path, "rt") as f:
+                text_format.Merge(f.read(), saved_model_proto)
+        else:
+            raise ValueError(
+                "saved model meta file {} do not exist in {}".format(
+                    saved_model_meta_file_basename, saved_model_path
+                )
+            )
+        self.set_checkpoint_path(
+            os.path.join(saved_model_path, saved_model_proto.checkpoint_dir)
+        )
+        signature = None
+        if graph_name is None:
+            graph_name = saved_model_proto.default_graph_name
+        elif graph_name not in saved_model_proto.graphs:
+            raise ValueError("graph {} do not exist".format(graph_name))
+        graph_def = saved_model_proto.graphs[graph_name]
+        if signature_name is None and graph_def.HasField("default_signature_name"):
+            signature_name = graph_def.default_signature_name
+        if signature_name is not None:
+            if signature_name not in graph_def.signatures:
+                raise ValueError("signature {} do not exist".format(signature_name))
+            else:
+                signature = graph_def.signatures[signature_name]
+        with self.open(graph_name, signature):
+            self.compile(graph_def.op_list)
+
+    def print_job_set(self):
+        self._check_status(self.SessionStatus.OPEN, self.SessionStatus.RUNNING)
+        job_set = c_api_util.GetJobSet()
+        for job in job_set.job:
+            print("job_name:", job.job_conf.job_name)
+            for op_conf in job.net.op:
+                print("\top_name:", op_conf.name)
+
+    def list_jobs(self):
+        self._check_status(self.SessionStatus.RUNNING)
+        return list(self.job_name2job_conf_.keys())
+
+    def list_inputs(self):
+        self._check_status(self.SessionStatus.RUNNING)
+        input_names = []
+        for (
+            input_name,
+            _,
+        ) in self.inter_user_job_info_.input_or_var_op_name2push_job_name.items():
+            input_names.append(input_name)
+        return tuple(input_names)
+
+    def list_outputs(self):
+        self._check_status(self.SessionStatus.RUNNING)
+        output_names = []
+        for (
+            output_name,
+            _,
+        ) in self.inter_user_job_info_.output_or_var_op_name2pull_job_name.items():
+            output_names.append(output_name)
+        return tuple(output_names)
+
+    def input_info(self, input_name, job_name=None):
+        return self._get_op_blob_info(job_name, input_name, "out")
+
+    def output_info(self, output_name, job_name=None):
+        return self._get_op_blob_info(job_name, output_name, "in")
+
+    def _get_op_blob_info(self, job_name, op_name, blob_name):
+        self._check_status(self.SessionStatus.OPEN, self.SessionStatus.RUNNING)
+        if op_name in self.inferface_name2info_:
+            return self.inferface_name2info_[op_name]
+        job_name = job_name or self.cur_job_name_
+        if job_name is None:
+            raise ValueError("please specify job_name")
+        lbn = oneflow._oneflow_internal.JobBuildAndInferCtx_GetOpBlobLbn(
+            job_name, op_name, blob_name
+        )
+        shape = c_api_util.JobBuildAndInferCtx_GetStaticShape(job_name, lbn)
+        dtype = c_api_util.JobBuildAndInferCtx_GetDataType(job_name, lbn)
+        dtype = dtype_util.convert_proto_dtype_to_oneflow_dtype(dtype)
+        info = dict(shape=shape, dtype=dtype)
+        self.inferface_name2info_[op_name] = info
+        return info
+
+    def run(self, job_name, **kwargs):
+        self._check_status(self.SessionStatus.RUNNING)
+        return self.event_loop_.run_until_complete(self.async_run(job_name, **kwargs))
+
+    async def async_run(self, job_name, **kwargs):
+        self._check_status(self.SessionStatus.RUNNING)
+        self._run_push_jobs(**kwargs)
+        job_inst = job_instance_util.MakeUserJobInstance(job_name)
+        self._run_job(job_inst)
+        output_futures = tuple(self._run_pull_jobs(job_name).values())
+        return await asyncio.gather(*output_futures)
+
+    def _run_job(self, job_inst):
+        future = self.event_loop_.create_future()
+
+        def job_finish_cb(_):
+            self.event_loop_.call_soon_threadsafe(future.set_result, None)
+
+        job_inst.AddPostFinishCallback(job_finish_cb)
+        oneflow._oneflow_internal.LaunchJob(job_inst)
+        self.job_futures_.append(future)
+
+    def _run_push_jobs(self, **kwargs):
+        for (
+            input_name,
+            push_job_name,
+        ) in self.inter_user_job_info_.input_or_var_op_name2push_job_name.items():
+            if input_name not in kwargs:
+                raise ValueError('input "{}" is absent'.format(input_name))
+            input_numpy = kwargs[input_name]
+            if not isinstance(input_numpy, np.ndarray):
+                raise ValueError('input "{}" requires numpy.ndarray'.format(input_name))
+            push_fn = input_blob_util._MakePushNdarrayCallback(input_numpy)
+            push_job_inst = job_instance_util.MakePushJobInstance(
+                push_job_name, input_name, push_fn
+            )
+            self._run_job(push_job_inst)
+
+    def _run_pull_jobs(self, user_job_name):
+        output_futures = {}
+        for (
+            output_name,
+            pull_job_name,
+        ) in self.inter_user_job_info_.output_or_var_op_name2pull_job_name.items():
+            future = self.event_loop_.create_future()
+            pull_fn = self._make_pull_job_cb(output_name, user_job_name, future)
+            pull_job_inst = job_instance_util.MakePullJobInstance(
+                pull_job_name, output_name, pull_fn
+            )
+            self._run_job(pull_job_inst)
+            output_futures[output_name] = future
+        return output_futures
+
+    def _make_pull_job_cb(self, output_name, user_job_name, future):
+        output_lbn = oneflow._oneflow_internal.JobBuildAndInferCtx_GetOpBlobLbn(
+            user_job_name, output_name, "out"
+        )
+        split_axis = c_api_util.JobBuildAndInferCtx_GetSplitAxisFromProducerView(
+            user_job_name, output_lbn
+        )
+
+        def pull_fn(ofblob):
+            ndarray = ofblob.CopyToNdarray()
+            self.event_loop_.call_soon_threadsafe(future.set_result, ndarray)
+
+        return pull_fn
+
+    def _run_load_checkpoint_job(self):
+        if self.checkpoint_path_ is None:
+            raise ValueError("checkpoint path not set")
+
+        def copy_model_load_path(ofblob):
+            ofblob.CopyFromNdarray(
+                np.frombuffer(self.checkpoint_path_.encode("ascii"), dtype=np.int8)
+            )
+
+        load_checkpoint_job_inst = job_instance_util.MakeJobInstance(
+            self.inter_user_job_info_.global_model_load_job_name,
+            push_cb=copy_model_load_path,
+        )
+        self._run_job(load_checkpoint_job_inst)
+
+    async def wait_for_all_jobs_finished(self):
+        await asyncio.gather(*self.job_futures_)
+        self.job_futures_ = []
diff --git a/python/oneflow/serving/saved_model_builder.py b/python/oneflow/serving/saved_model_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..34c98e8557a86fefa0691b335adc4e521b1d7436
--- /dev/null
+++ b/python/oneflow/serving/saved_model_builder.py
@@ -0,0 +1,312 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import typing
+
+from google.protobuf import text_format
+
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.core.job.job_conf_pb2 as job_conf_pb
+import oneflow.core.job.sbp_parallel_pb2 as sbp_parallel_pb
+import oneflow.core.operator.interface_blob_conf_pb2 as interface_blob_conf_pb
+import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_pb
+import oneflow.core.serving.saved_model_pb2 as saved_model_pb
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.session_context as session_ctx
+
+
+class ModelBuilder(object):
+    DEFAULT_CHECKPOINT_DIR = "variables"
+    DEFAULT_SAVED_MODEL_FILE_BASENAME = "saved_model"
+
+    def __init__(self, save_path: str):
+        if not isinstance(save_path, str):
+            raise ValueError(
+                "param 'save_path' must be str, but got {}".format(save_path)
+            )
+        self.version_ = None
+        self.checkpoint_dir_ = self.DEFAULT_CHECKPOINT_DIR
+        self.saved_model_dir_ = save_path
+        self.saved_model_pb_filename_ = "{}.pb".format(
+            self.DEFAULT_SAVED_MODEL_FILE_BASENAME
+        )
+        self.saved_model_pbtxt_filename_ = "{}.prototxt".format(
+            self.DEFAULT_SAVED_MODEL_FILE_BASENAME
+        )
+        self.saved_model_proto_ = saved_model_pb.SavedModel()
+        self.graph_builders_ = {}
+
+    @property
+    def proto(self):
+        return self.saved_model_proto_
+
+    def ModelName(self, model_name: str):
+        assert isinstance(model_name, str)
+        self.proto.name = model_name
+        return self
+
+    def Version(self, version: int):
+        assert isinstance(version, int)
+        self.version_ = version
+        return self
+
+    def AddFunction(self, func):
+        func_name = func.__name__
+        if func_name in self.graph_builders_:
+            raise ValueError("function with name {} already exists".format(func_name))
+        graph_builder = GraphBuilder(func_name, self)
+        self.graph_builders_[func_name] = graph_builder
+        if not self.proto.HasField("default_graph_name"):
+            self.proto.default_graph_name = func_name
+        return graph_builder
+
+    def _check_input_output_name_conflict(self):
+        name_set = set()
+        lbn_set = set()
+
+        def check_name_conflict(name, interface_def):
+            if name in name_set:
+                raise ValueError("input conflict, {} already exist".format(name))
+            name_set.add(name)
+            lbn = Lbi2Lbn(interface_def.lbi)
+            if lbn in lbn_set:
+                raise ValueError(
+                    "input conflict, {} already bind to other input".format(lbn)
+                )
+            lbn_set.add(lbn)
+
+        for (_, graph_def) in self.proto.graphs.items():
+            for (_, signature_def) in graph_def.signatures.items():
+                for (input_name, input_def) in signature_def.inputs.items():
+                    check_name_conflict(input_name, input_def)
+                for (output_name, output_def) in signature_def.outputs.items():
+                    check_name_conflict(output_name, output_def)
+
+    @session_ctx.try_init_default_session
+    def Save(self, save_model_before_graph_complete: bool = True):
+        self._check_input_output_name_conflict()
+        for (_, graph_builder) in self.graph_builders_.items():
+            if not graph_builder.finished:
+                graph_builder.Finish()
+        sess = session_ctx.GetDefaultSession()
+        for (graph_name, graph_def) in self.proto.graphs.items():
+            job = sess.Job(
+                graph_name
+                if save_model_before_graph_complete
+                else graph_name + "_after_complete"
+            )
+            graph_def.op_list.extend(list(job.net.op))
+        if not os.path.exists(self.saved_model_dir_):
+            os.makedirs(self.saved_model_dir_)
+        if self.version_ is None:
+            raise ValueError("model version is not set")
+        version_dir = os.path.join(self.saved_model_dir_, str(self.version_))
+        if os.path.exists(version_dir):
+            raise ValueError(
+                'Directory of model "{}" version "{}" already exist.'.format(
+                    self.saved_model_dir_, self.version_
+                )
+            )
+        os.makedirs(version_dir)
+        self.proto.version = self.version_
+        checkpoint_path = os.path.join(version_dir, self.checkpoint_dir_)
+        flow.checkpoint.save(checkpoint_path)
+        self.proto.checkpoint_dir = self.checkpoint_dir_
+        saved_model_pb_path = os.path.join(version_dir, self.saved_model_pb_filename_)
+        with open(saved_model_pb_path, "wb") as writer:
+            writer.write(self.saved_model_proto_.SerializeToString())
+        saved_model_pbtxt_path = os.path.join(
+            version_dir, self.saved_model_pbtxt_filename_
+        )
+        with open(saved_model_pbtxt_path, "wt") as writer:
+            writer.write(text_format.MessageToString(self.saved_model_proto_))
+
+
+class GraphBuilder(object):
+    def __init__(self, name: str, model_builder: typing.Optional[ModelBuilder] = None):
+        if not isinstance(name, str):
+            raise ValueError("param 'name' must be str, but got {}".format(name))
+        if not isinstance(model_builder, ModelBuilder) and model_builder is not None:
+            raise ValueError(
+                "param 'model_builder' must be a type of ModelBuilder or None"
+            )
+        if model_builder is not None:
+            if name in model_builder.proto.graphs:
+                raise ValueError(
+                    "graph function ({}) is already added to model ({})".format(
+                        name, model_builder.proto.name
+                    )
+                )
+            self.proto_ = model_builder.proto.graphs[name]
+            self.owner_ = model_builder
+        else:
+            self.proto_ = saved_model_pb.GraphDef()
+            self.owner_ = None
+        self.name_ = name
+        self.finished_ = False
+        self.signature_builders_ = {}
+
+    @property
+    def name(self):
+        return self.name_
+
+    @property
+    def proto(self):
+        return self.proto_
+
+    @property
+    def finished(self):
+        return self.finished_
+
+    def AddSignature(self, signature_name: str):
+        assert isinstance(signature_name, str)
+        if signature_name in self.signature_builders_:
+            raise ValueError("signature name {} already exists".format(signature_name))
+        signature_builder = SignatureBuilder(signature_name, self)
+        self.signature_builders_[signature_name] = signature_builder
+        if not self.proto.HasField("default_signature_name"):
+            self.proto.default_signature_name = signature_name
+        return signature_builder
+
+    def Finish(self):
+        assert self.finished is False
+        for (_, signature_def) in self.proto.signatures.items():
+            for (_, input_def) in signature_def.inputs.items():
+                input_lbn = Lbi2Lbn(input_def.lbi)
+                oneflow._oneflow_internal.JobBuildAndInferCtx_CheckLbnValidAndExist(
+                    self.name, input_lbn
+                )
+                GetInterfaceBlobConf(self.name, input_lbn, input_def.blob_conf)
+            for (_, output_def) in signature_def.outputs.items():
+                oneflow._oneflow_internal.JobBuildAndInferCtx_CheckLbnValidAndExist(
+                    self.name, Lbi2Lbn(output_def.lbi)
+                )
+        self.finished_ = True
+
+    def OwnerModelBuilder(self):
+        return self.owner_
+
+    def AsDefault(self):
+        if self.owner_ is not None:
+            self.owner_.proto.default_graph_name = self.name
+        return self
+
+
+class SignatureBuilder(object):
+    def __init__(self, name: str, graph_builder: typing.Optional[GraphBuilder] = None):
+        if not isinstance(name, str):
+            raise ValueError("param 'name' must be str, but got {}".format(name))
+        if not isinstance(graph_builder, GraphBuilder) and graph_builder is not None:
+            raise ValueError(
+                "param 'graph_builder' must be a type of GraphBuilder or None"
+            )
+        if graph_builder is not None:
+            if name in graph_builder.proto.signatures:
+                raise ValueError(
+                    "signature ({}) already exist in graph ({})".format(
+                        name, graph_builder.name
+                    )
+                )
+            self.proto_ = graph_builder.proto.signatures[name]
+            self.owner_ = graph_builder
+        else:
+            self.proto_ = job_conf_pb.JobSignatureDef()
+            self.owner_ = None
+        self.name_ = name
+
+    @property
+    def name(self):
+        return self.name_
+
+    @property
+    def proto(self):
+        return self.proto_
+
+    def Input(self, input_name: str, lbn: str):
+        assert isinstance(input_name, str)
+        assert isinstance(lbn, str)
+        assert "/" in lbn
+        if input_name in self.proto.inputs:
+            raise ValueError(
+                "input_name ({}) already exist in signature ({}) of graph ({})".format(
+                    input_name, self.name, self.graph_builder_.name
+                )
+            )
+        input_def = self.proto.inputs[input_name]
+        Lbn2Lbi(lbn, input_def.lbi)
+        return self
+
+    def Output(self, output_name: str, lbn: str):
+        assert isinstance(output_name, str)
+        assert isinstance(lbn, str)
+        assert "/" in lbn
+        if output_name in self.proto.outputs:
+            raise ValueError(
+                "output_name ({}) already exist in signature ({}) of graph ({})".format(
+                    output_name, self.name, self.graph_builder_.name
+                )
+            )
+        output_def = self.proto.outputs[output_name]
+        Lbn2Lbi(lbn, output_def.lbi)
+        return self
+
+    def OwnerGraphBuilder(self):
+        return self.owner_
+
+    def AsDefault(self):
+        if self.owner_ is not None:
+            self.owner_.proto.default_signature_name = self.name
+        return self
+
+
+def GetInterfaceBlobConf(job_name, lbn, blob_conf=None):
+    assert isinstance(job_name, str)
+    assert isinstance(lbn, str)
+    if blob_conf is None:
+        blob_conf = interface_blob_conf_pb.InterfaceBlobConf()
+    else:
+        assert isinstance(blob_conf, interface_blob_conf_pb.InterfaceBlobConf)
+    shape = c_api_util.JobBuildAndInferCtx_GetStaticShape(job_name, lbn)
+    dtype = c_api_util.JobBuildAndInferCtx_GetDataType(job_name, lbn)
+    split_axis = c_api_util.JobBuildAndInferCtx_GetSplitAxisFromProducerView(
+        job_name, lbn
+    )
+    is_dynamic = c_api_util.JobBuildAndInferCtx_IsDynamic(job_name, lbn)
+    blob_conf.shape.dim.extend(shape)
+    blob_conf.data_type = dtype
+    if split_axis is not None:
+        sbp_parallel = sbp_parallel_pb.SbpParallel()
+        sbp_parallel.split_parallel.axis = split_axis
+        blob_conf.parallel_distribution.sbp_parallel.extend([sbp_parallel])
+    blob_conf.is_dynamic = is_dynamic
+    return blob_conf
+
+
+def Lbn2Lbi(lbn, lbi=None):
+    assert isinstance(lbn, str)
+    assert "/" in lbn, 'invalid lbn "{}"'.format(lbn)
+    [op_name, blob_name] = lbn.split("/")
+    if lbi is None:
+        lbi = logical_blob_id_pb.LogicalBlobId()
+    lbi.op_name = op_name
+    lbi.blob_name = blob_name
+    return lbi
+
+
+def Lbi2Lbn(lbi):
+    assert isinstance(lbi, logical_blob_id_pb.LogicalBlobId)
+    return "{}/{}".format(lbi.op_name, lbi.blob_name)
diff --git a/python/oneflow/support/__init__.py b/python/oneflow/support/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/support/async_util.py b/python/oneflow/support/async_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8143316f54ae66eb5e431715bb009a4f140321f2
--- /dev/null
+++ b/python/oneflow/support/async_util.py
@@ -0,0 +1,38 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import threading
+
+
+def Await(counter, func):
+    assert counter > 0
+    cond_var = threading.Condition()
+    counter_box = [counter]
+    result_list = []
+
+    def Yield(result=None):
+        result_list.append(result)
+        cond_var.acquire()
+        assert counter_box[0] > 0
+        counter_box[0] -= 1
+        cond_var.notify()
+        cond_var.release()
+
+    func(Yield)
+    cond_var.acquire()
+    while counter_box[0] > 0:
+        cond_var.wait()
+    cond_var.release()
+    return result_list
diff --git a/python/oneflow/support/box.py b/python/oneflow/support/box.py
new file mode 100644
index 0000000000000000000000000000000000000000..91442247fc7b793255d6121467595229aa729f7e
--- /dev/null
+++ b/python/oneflow/support/box.py
@@ -0,0 +1,40 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class Box(object):
+    def __init__(self, *arg):
+        assert len(arg) <= 1
+        self.has_value_ = len(arg) > 0
+        self.value_ = None
+        if self.has_value_:
+            self.value_ = arg[0]
+
+    @property
+    def value(self):
+        assert self.has_value_
+        return self.value_
+
+    @property
+    def value_setter(self):
+        return lambda val: self.set_value(val)
+
+    def set_value(self, val):
+        self.value_ = val
+        self.has_value_ = True
+
+    def has_value(self):
+        return self.has_value_
diff --git a/python/oneflow/support/enable_if.py b/python/oneflow/support/enable_if.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9e10337d80b5a8cc02f630f75b7dfc01bf1cf31
--- /dev/null
+++ b/python/oneflow/support/enable_if.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+
+import oneflow.support.traceinfo as traceinfo
+
+
+def condition(hob_expr):
+    def Decorator(func):
+        func.__oneflow_condition_hob__ = hob_expr
+        return func
+
+    return Decorator
+
+
+def get_condition_hob(func):
+    assert hasattr(func, "__oneflow_condition_hob__")
+    return func.__oneflow_condition_hob__
+
+
+def set_condition_hob(func, hob):
+    func.__oneflow_condition_hob__ = hob
+
+
+def unique(arg_funcs, context=None, default=None):
+    assert isinstance(arg_funcs, (list, tuple))
+    conditional_functions = []
+    for arg_func in arg_funcs:
+        if isinstance(arg_func, tuple):
+            (func, hob_expr) = arg_func
+        elif inspect.isfunction(arg_func):
+            func = arg_func
+            assert hasattr(func, "__oneflow_condition_hob__")
+            hob_expr = func.__oneflow_condition_hob__
+        else:
+            raise NotImplementedError
+        debug_str = func.__name__
+        if hasattr(func, "__debug_str__"):
+            debug_str = func.__debug_str__
+        conditional_functions.append((hob_expr, func, debug_str))
+    if default is None:
+
+        def default(get_failed_info, *args, **kwargs):
+            raise NotImplementedError(get_failed_info())
+
+    matched_func = GetMatchedFunction(default, conditional_functions, context=context)
+    if matched_func is not None:
+        return matched_func
+    return MakeDefaultFunction(default, conditional_functions, context=context)
+
+
+def GetMatchedFunction(default, conditional_functions, context=None):
+    select_triple = (None, None, None)
+    for triple in conditional_functions:
+        if not triple[0](context):
+            continue
+        if select_triple[1] is not None:
+            return _MultiMatchedErrorFunction(
+                default, [select_triple, triple], context=context
+            )
+        select_triple = triple
+    return select_triple[1]
+
+
+def MakeDefaultFunction(default, conditional_functions, context=None):
+    def get_failed_info(customized_prompt=None):
+        failed_info = "no avaliable function found.\n"
+        for (bf, func, location) in conditional_functions:
+            prompt = location if customized_prompt is None else customized_prompt
+            failed_info += "\n%s: \x1b[1;31mFAILED\x1b[0m\n\t%s\n" % (
+                prompt,
+                bf.debug_str(context),
+            )
+        return failed_info
+
+    return lambda *args, **kwargs: default(get_failed_info, *args, **kwargs)
+
+
+def _MultiMatchedErrorFunction(default, matched_functions, context=None):
+    def get_failed_info(customized_prompt=None):
+        failed_info = "at least two conditional functions matched.\n"
+        for (bf, func, location) in matched_functions:
+            prompt = location if customized_prompt is None else customized_prompt
+            failed_info += "\n%s: \x1b[1;31mPASSED\x1b[0m\n\t%s\n" % (
+                prompt,
+                bf.debug_str(context),
+            )
+        return failed_info
+
+    return lambda *args, **kwargs: default(get_failed_info, *args, **kwargs)
diff --git a/python/oneflow/support/func_inspect_util.py b/python/oneflow/support/func_inspect_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..acfb0ced42062f6104d93523835ff0b0890d48fc
--- /dev/null
+++ b/python/oneflow/support/func_inspect_util.py
@@ -0,0 +1,49 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import sys
+
+if sys.version_info > (2, 7) and sys.version_info < (3, 0):
+
+    def GetArgNameAndDefaultTuple(func):
+        """
+      returns a dictionary of arg_name:default_values for the input function
+      """
+        (args, varargs, keywords, defaults) = inspect.getargspec(func)
+        defaults = list(defaults) if defaults is not None else []
+        while len(defaults) < len(args):
+            defaults.insert(0, None)
+        return tuple(zip(args, defaults))
+
+
+elif sys.version_info >= (3, 0):
+
+    def GetArgNameAndDefaultTuple(func):
+        signature = inspect.signature(func)
+        return tuple(
+            [
+                (k, v.default if v.default is not inspect.Parameter.empty else None)
+                for (k, v) in signature.parameters.items()
+            ]
+        )
+
+
+else:
+    raise NotImplementedError
+
+
+def GetArgDefaults(func):
+    return tuple(map(lambda x: x[1], GetArgNameAndDefaultTuple(func)))
diff --git a/python/oneflow/support/high_order_bool.py b/python/oneflow/support/high_order_bool.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e813722ff81a1f4a31499318c466c8f71cf787c
--- /dev/null
+++ b/python/oneflow/support/high_order_bool.py
@@ -0,0 +1,207 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+import oneflow._oneflow_internal
+
+
+def bool_functor(verbose_debug_str):
+    def Decorator(match_function):
+        return HighOrderBool(verbose_debug_str, match_function)
+
+    return Decorator
+
+
+def hob_context_attr(attr_name):
+    def Decorator(attr_getter):
+        return HobContextAttr(attr_name, attr_getter)
+
+    return Decorator
+
+
+class BoolFunctor(object):
+    def debug_str(self, ctx, display_result=True):
+        if hasattr(self, "__debug_str__"):
+            if display_result:
+                return '"%s"[%s]' % (self.__debug_str__, self(ctx))
+            else:
+                return '"%s"' % self.__debug_str__
+        return self.verbose_debug_str(ctx, display_result=display_result)
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        raise NotImplementedError
+
+    def __call__(self, ctx):
+        raise NotImplementedError
+
+    def __and__(self, rhs):
+        return _AndBoolFunctor(self, rhs)
+
+    def __or__(self, rhs):
+        return _OrBoolFunctor(self, rhs)
+
+    def __invert__(self):
+        return _NotBoolFunctor(self)
+
+
+class HighOrderBool(BoolFunctor):
+    def __init__(self, verbose_debug_str, function):
+        self.verbose_debug_str_ = verbose_debug_str
+        self.function_ = function
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        if display_result:
+            return '"%s"[%s]' % (self.verbose_debug_str_, self.function_(ctx))
+        else:
+            return '"%s"' % self.verbose_debug_str_
+
+    def __call__(self, ctx):
+        return self.function_(ctx)
+
+
+always_true = HighOrderBool("Always true", lambda: True)
+always_false = HighOrderBool("Always false", lambda: False)
+
+
+class _AndBoolFunctor(BoolFunctor):
+    def __init__(self, lhs, rhs):
+        assert isinstance(lhs, BoolFunctor)
+        assert isinstance(rhs, BoolFunctor)
+        self.lhs_ = lhs
+        self.rhs_ = rhs
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        left_display = self.lhs_.debug_str(ctx, display_result)
+        display_result = display_result and self.lhs_(ctx)
+        right_display = self.rhs_.debug_str(ctx, display_result)
+        return "(%s and %s)" % (left_display, right_display)
+
+    def __call__(self, ctx):
+        return self.lhs_(ctx) and self.rhs_(ctx)
+
+
+class _OrBoolFunctor(BoolFunctor):
+    def __init__(self, lhs, rhs):
+        assert isinstance(lhs, BoolFunctor)
+        assert isinstance(rhs, BoolFunctor)
+        self.lhs_ = lhs
+        self.rhs_ = rhs
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        left_display = self.lhs_.debug_str(ctx, display_result)
+        display_result = display_result and (not self.lhs_(ctx))
+        right_display = self.rhs_.debug_str(ctx, display_result)
+        return "(%s or %s)" % (left_display, right_display)
+
+    def __call__(self, ctx):
+        return self.lhs_(ctx) or self.rhs_(ctx)
+
+
+class _NotBoolFunctor(BoolFunctor):
+    def __init__(self, x):
+        assert isinstance(x, BoolFunctor)
+        self.x_ = x
+
+    def verbose_debug_str(self, ctx, display_result=True):
+        return "(not %s)" % self.x_.debug_str(ctx, display_result)
+
+    def __call__(self, ctx):
+        return not self.x_(ctx)
+
+
+class HobContextGetter(object):
+    def __init__(self, attr_name, attr_getter):
+        self.attr_name_ = attr_name
+        self.attr_getter_ = attr_getter
+
+    @property
+    def attr_name(self):
+        return self.attr_name_
+
+    @property
+    def attr_getter(self):
+        return self.attr_getter_
+
+    def __eq__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "==", lambda a, b: a == b)
+
+    def __ne__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "!=", lambda a, b: a != b)
+
+    def __gt__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, ">", lambda a, b: a > b)
+
+    def __ge__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, ">=", lambda a, b: a >= b)
+
+    def __lt__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "<", lambda a, b: a < b)
+
+    def __le__(self, other):
+        if not isinstance(other, HobContextGetter):
+            other = HobContextConstant(other)
+        return self._MakeHob(other, "<=", lambda a, b: a <= b)
+
+    def _MakeHob(self, other, cmp_str, cmp_func):
+        @bool_functor("%s %s %s" % (self.attr_name, cmp_str, other.attr_name))
+        def HobHob(context):
+            return cmp_func(self.attr_getter(context), other.attr_getter(context))
+
+        return HobHob
+
+
+class HobContextConstant(HobContextGetter):
+    def __init__(self, value):
+        HobContextGetter.__init__(self, str(value), lambda ctx: value)
+
+
+class HobContextAttr(HobContextGetter):
+    def __init__(self, attr_name, attr_getter):
+        HobContextGetter.__init__(self, attr_name, attr_getter)
+
+    def __getattr__(self, attr_name):
+        @hob_context_attr("%s.%s" % (self.attr_name, attr_name))
+        def HobCtxAttr(ctx):
+            obj = self.attr_getter(ctx)
+            if isinstance(obj, oneflow._oneflow_internal.CfgMessage):
+                return getattr(obj, attr_name)()
+            else:
+                return getattr(obj, attr_name)
+
+        return HobCtxAttr
+
+    def HasField(self, attr_name):
+        @bool_functor('%s.HasField("%s")' % (self.attr_name, attr_name))
+        def BoolFunctor(ctx):
+            obj = self.attr_getter(ctx)
+            if isinstance(obj, oneflow._oneflow_internal.CfgMessage):
+                assert hasattr(obj, "has_" + attr_name), type(obj)
+                return getattr(obj, "has_" + attr_name)()
+            elif hasattr(obj, "HasField"):
+                return obj.HasField(attr_name)
+            else:
+                return hasattr(obj, attr_name)
+
+        return BoolFunctor
diff --git a/python/oneflow/support/lazy.py b/python/oneflow/support/lazy.py
new file mode 100644
index 0000000000000000000000000000000000000000..27660fd03d5676f0c922128188101b471b82e8e1
--- /dev/null
+++ b/python/oneflow/support/lazy.py
@@ -0,0 +1,29 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class Lazy(object):
+    def __init__(self, get_value):
+        self.value_ = None
+        self.has_value_ = False
+        self.get_value_ = get_value
+
+    @property
+    def value(self):
+        if not self.has_value_:
+            self.value_ = self.get_value_()
+            self.has_value_ = True
+        return self.value_
diff --git a/python/oneflow/support/pb_util.py b/python/oneflow/support/pb_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1299a91f2395a251dc078da5d9c8c39e6a4884ad
--- /dev/null
+++ b/python/oneflow/support/pb_util.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def PythonDict2CFG(value, msg):
+    def extend_dict(values, msg):
+        for (k, v) in values.items():
+            if type(v) is dict:
+                extend_dict(v, getattr(msg, "mutable_" + k)())
+            elif type(v) is list or type(v) is tuple:
+                extend_list_or_tuple(v, msg, k)
+            else:
+                getattr(msg, "set_" + k)(v)
+
+    def extend_list_or_tuple(values, msg, attr):
+        if len(values) == 0 or type(values[0]) is dict:
+            msg = getattr(msg, "mutable_" + attr)()
+            for v in values:
+                cmd = msg.Add()
+                extend_dict(v, cmd)
+        else:
+            for v in values:
+                getattr(msg, "add_" + attr)(v)
+
+    extend_dict(value, msg)
+    return msg
+
+
+def PythonDict2PbMessage(value, msg):
+    def extend_dict(values, msg):
+        for (k, v) in values.items():
+            if type(v) is dict:
+                extend_dict(v, getattr(msg, k))
+            elif type(v) is list or type(v) is tuple:
+                extend_list_or_tuple(v, getattr(msg, k))
+            else:
+                setattr(msg, k, v)
+        else:
+            msg.SetInParent()
+
+    def extend_list_or_tuple(values, msg):
+        if len(values) == 0:
+            return
+        if type(values[0]) is dict:
+            for v in values:
+                cmd = msg.add()
+                extend_dict(v, cmd)
+        else:
+            msg.extend(values)
+
+    extend_dict(value, msg)
+    return msg
+
+
+def MergePbMessage(dst, src):
+    assert type(dst) is type(src)
+    for field in dst.DESCRIPTOR.fields:
+        field_name = field.name
+        if field.containing_oneof is not None:
+            if dst.WhichOneof(field.containing_oneof.name) is not None:
+                continue
+            src_field_name = src.WhichOneof(field.containing_oneof.name)
+            if src_field_name is None:
+                continue
+            if field_name != src_field_name:
+                continue
+        else:
+            if dst.HasField(field_name):
+                continue
+            if not src.HasField(field_name):
+                continue
+        _MergePbMessageField(dst, src, field)
+
+
+def _MergePbMessageField(dst, src, field):
+    if field.message_type is None:
+        setattr(dst, field.name, getattr(src, field.name))
+    else:
+        MergePbMessage(getattr(dst, field.name), getattr(src, field.name))
diff --git a/python/oneflow/support/scope_stack.py b/python/oneflow/support/scope_stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..28014d06035a6133ed37a2cf9b254e5a35372066
--- /dev/null
+++ b/python/oneflow/support/scope_stack.py
@@ -0,0 +1,34 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+
+
+class ScopeStack(object):
+    def __init__(self, init=[]):
+        if not isinstance(init, list):
+            init = [init]
+        assert isinstance(init, list)
+        self.stack_ = init
+
+    def Current(self):
+        assert len(self.stack_) > 0
+        return self.stack_[0]
+
+    @contextmanager
+    def NewScope(self, scope):
+        self.stack_.insert(0, scope)
+        yield
+        self.stack_.pop(0)
diff --git a/python/oneflow/support/traceinfo.py b/python/oneflow/support/traceinfo.py
new file mode 100644
index 0000000000000000000000000000000000000000..11702a78c478c30e54d310bf9b94b7813ff58255
--- /dev/null
+++ b/python/oneflow/support/traceinfo.py
@@ -0,0 +1,34 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import traceback
+
+
+def GetFrameLocationStr(depth=-1):
+    assert depth < 0
+    frame = traceback.extract_stack()[depth - 1]
+    return "%s:%d" % (frame[0], frame[1])
+
+
+def GetStackInfoExcludeOneflowPythonFile():
+    import oneflow
+
+    dirname = os.path.dirname(oneflow.__file__)
+    stack_info = traceback.extract_stack()
+    filtered_stack_info = filter(
+        lambda x: x[0].startswith(dirname) == False, stack_info
+    )
+    return list(filtered_stack_info)
diff --git a/python/oneflow/sysconfig.py b/python/oneflow/sysconfig.py
new file mode 100644
index 0000000000000000000000000000000000000000..428c8384487d6159ef0a0a2d3dbfed84ac0610b1
--- /dev/null
+++ b/python/oneflow/sysconfig.py
@@ -0,0 +1,26 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.sysconfig import (
+    cmake_build_type,
+    get_compile_flags,
+    get_include,
+    get_lib,
+    get_link_flags,
+    has_rpc_backend_grpc,
+    has_rpc_backend_local,
+    with_cuda,
+    with_xla,
+)
diff --git a/python/oneflow/system.py b/python/oneflow/system.py
new file mode 100644
index 0000000000000000000000000000000000000000..270738abdb3ed7d30f57e19390283b8292c03335
--- /dev/null
+++ b/python/oneflow/system.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.ops.assign_op import api_system_assign as assign
diff --git a/python/oneflow/tensorrt.py b/python/oneflow/tensorrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccfcbb537dab45e686fbbee6e50359fee62c2ef2
--- /dev/null
+++ b/python/oneflow/tensorrt.py
@@ -0,0 +1,19 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.contrib.tensorrt.tensorrt_api import (
+    cache_int8_calibration,
+    write_int8_calibration,
+)
diff --git a/python/oneflow/test/dataloader/test_numpy_dataset.py b/python/oneflow/test/dataloader/test_numpy_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..995df2b0f459a6246428e490239faeb4bedf5625
--- /dev/null
+++ b/python/oneflow/test/dataloader/test_numpy_dataset.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+import oneflow.utils.data as Data
+
+
+class ScpDataset(Data.Dataset):
+    def __init__(self, chunksize=200, dim=81, length=2000):
+        self.chunksize = chunksize
+        self.dim = dim
+        self.length = length
+
+    def __getitem__(self, index):
+        np.random.seed(index)
+        return np.random.randn(self.chunksize, self.dim)
+
+    def __len__(self):
+        return self.length
+
+
+@flow.unittest.skip_unless_1n1d()
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+class TestNumpyDataset(flow.unittest.TestCase):
+    def test_numpy_dataset(test_case):
+        dataset = ScpDataset()
+        dataloader = Data.DataLoader(dataset, batch_size=16, shuffle=True)
+        for X in dataloader:
+            test_case.assertEqual(X.shape, flow.Size([16, 200, 81]))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/dataloader/test_tensor_dataset.py b/python/oneflow/test/dataloader/test_tensor_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..046e730a1b75c77c9ec62e1dc108fa69d6f69bae
--- /dev/null
+++ b/python/oneflow/test/dataloader/test_tensor_dataset.py
@@ -0,0 +1,76 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.nn as nn
+import oneflow.unittest
+import oneflow.utils.data as Data
+
+
+class LinearNet(nn.Module):
+    def __init__(self, n_feature):
+        super(LinearNet, self).__init__()
+        self.linear = nn.Linear(n_feature, 1)
+
+    def forward(self, x):
+        y = self.linear(x)
+        return y
+
+
+@flow.unittest.skip_unless_1n1d()
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+class TestTensorDataset(flow.unittest.TestCase):
+    def test_tensor_dataset(test_case):
+        num_inputs = 2
+        num_examples = 1000
+        true_w = [2, -3.4]
+        true_b = 4.2
+        net = LinearNet(num_inputs)
+        flow.nn.init.normal_(net.linear.weight, mean=0, std=0.01)
+        flow.nn.init.constant_(net.linear.bias, val=0)
+        loss = nn.MSELoss()
+        optimizer = flow.optim.SGD(net.parameters(), lr=0.03)
+        features = flow.tensor(
+            np.random.normal(0, 1, (num_examples, num_inputs)), dtype=flow.float
+        )
+        labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+        labels += flow.tensor(
+            np.random.normal(0, 0.01, size=labels.size()), dtype=flow.float
+        )
+        batch_size = 10
+        dataset = Data.TensorDataset(features, labels)
+        data_iter = Data.DataLoader(dataset, batch_size, shuffle=True, num_workers=0)
+        num_epochs = 10
+        for epoch in range(1, num_epochs + 1):
+            for (X, y) in data_iter:
+                output = net(X)
+                l = loss(output, y)
+                optimizer.zero_grad()
+                l.backward()
+                optimizer.step()
+            if epoch == num_epochs:
+                test_case.assertLess(l.numpy(), 0.00019)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_forward_graph.py b/python/oneflow/test/graph/test_forward_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5cbd4efcd1d1882085f7396c75add2b4d2cc6fc
--- /dev/null
+++ b/python/oneflow/test/graph/test_forward_graph.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import oneflow
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestForwardGraph(flow.unittest.TestCase):
+    def test_forward_graph(test_case):
+        class SubModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = flow.nn.Parameter(flow.Tensor(6, 6))
+                self.relu = flow.nn.ReLU()
+
+            def forward(self, x, y):
+                x = oneflow.F.matmul(x, self.weight)
+                x = self.relu(x)
+                y = self.relu(y)
+                return (x, y)
+
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.layer = SubModule()
+                self.register_buffer("dummy_buff", flow.Tensor(6, 8))
+
+            def forward(self, x, y):
+                (x, y) = self.layer(x, y)
+                x = oneflow.F.flatten(x, 1)
+                x = oneflow.F.matmul(x, self.dummy_buff)
+                return (x, y)
+
+        class CustomGraph(flow.nn.Graph):
+            def __init__(self, module):
+                super().__init__()
+                self.m = module
+
+            def build(self, x, y):
+                out = self.m(x, y)
+                return out
+
+        m = CustomModule()
+        m.to("cuda")
+        g = CustomGraph(m)
+        x = flow.Tensor(6, 6)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        x = x.to("cuda")
+        y = flow.Tensor(10, 10)
+        flow.nn.init.uniform_(y, a=-1.0, b=1.0)
+        y = y.to("cuda")
+        print(repr(g))
+        (z, a) = g._compile(x, y)
+        test_case.assertEqual(z.shape, (6, 8))
+        test_case.assertEqual(z.is_lazy, False)
+        test_case.assertEqual(a.shape, (10, 10))
+        test_case.assertEqual(a.is_lazy, False)
+        print("graph proto: ", g._graph_proto)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_graph.py b/python/oneflow/test/graph/test_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e1c741f1c78a375cd4050d67755401be10d63d0
--- /dev/null
+++ b/python/oneflow/test/graph/test_graph.py
@@ -0,0 +1,279 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow
+import oneflow as flow
+import oneflow.framework.graph_build_util as graph_build_util
+import oneflow.unittest
+
+
+class SubModule(flow.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = flow.nn.Conv2d(1, 1, 5)
+        self.relu = flow.nn.ReLU()
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.relu(x)
+        return x
+
+
+class CustomModule(flow.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layer = SubModule()
+        self.fc1 = flow.nn.Linear(36, 4)
+        self.register_buffer("dummy_buff", flow.Tensor(1, 4))
+
+    def forward(self, x):
+        x = self.layer(x)
+        x = oneflow.F.flatten(x, 1)
+        x = self.fc1(x) + self.dummy_buff
+        return x
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGraph(flow.unittest.TestCase):
+    def test_add_nested_module(test_case):
+        x = flow.Tensor(1, 1, 10, 10)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        m = CustomModule()
+        y = m(x)
+
+        class CustomGraphNestedModule(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+                self.m = m
+
+            def build(self, x):
+                return self.m(x)
+
+        g = CustomGraphNestedModule()
+        test_case.assertEqual(g.name, g._c_nn_graph.name)
+        test_case.assertTrue(isinstance(g.m, flow.nn.graph.Block))
+        test_case.assertEqual(g.m.type, "MODULE")
+        test_case.assertEqual(g.m.name, "m")
+        test_case.assertTrue(isinstance(g.m.dummy_buff, flow.nn.graph.Block))
+        test_case.assertEqual(g.m.dummy_buff.type, "BUFFER")
+        test_case.assertTrue(isinstance(g.m.layer.conv1, flow.nn.graph.Block))
+        test_case.assertEqual(g.m.layer.conv1.name, "conv1")
+        test_case.assertEqual(g.m.layer.conv1.name_prefix, "m.layer.")
+        test_case.assertTrue(isinstance(g.m.layer.conv1.weight, flow.nn.graph.Block))
+        test_case.assertEqual(g.m.layer.conv1.weight.type, "PARAMETER")
+        g.m.layer.conv1._is_executing_forward = True
+        test_case.assertTrue(isinstance(g.m.layer.conv1.weight, flow.Tensor))
+        g.m.layer.conv1._is_executing_forward = False
+        test_case.assertEqual(g.m.layer.conv1.kernel_size, (5, 5))
+        z = g.build(x)
+        test_case.assertTrue(np.array_equal(y.numpy(), z.numpy()))
+
+    def test_graph_config(test_case):
+        print("cclog: CustomGraphConfig begin")
+
+        class CustomGraphConfig(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+                self.m = CustomModule()
+                self.config.enable_auto_mixed_precision(True)
+
+            def build(self, x):
+                x = self.m(x)
+                return x
+
+        g = CustomGraphConfig()
+        test_case.assertEqual(g.config.training, False)
+        g.config.enable_fuse_add_to_output(True)
+        g.config.enable_fuse_add_to_output(False)
+        for s in g._state():
+            print("g state: ", repr(s))
+        print(repr(g))
+        print("cclog: CustomGraphConfig done")
+
+    def test_graph_name(test_case):
+        print("cclog: GraphName begin")
+
+        class ACustomGraph(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+
+            def build(self, x):
+                return x
+
+        class BCustomGraph(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+
+            def build(self, x):
+                return x
+
+        class CBCustomGraph(BCustomGraph):
+            def __init__(self):
+                super().__init__()
+
+        def create_graph(cnt):
+            a = ACustomGraph()
+            test_case.assertEqual(a.name, "ACustomGraph_" + str(cnt))
+            b = BCustomGraph()
+            test_case.assertEqual(b.name, "BCustomGraph_" + str(cnt))
+            cb = CBCustomGraph()
+            test_case.assertEqual(cb.name, "CBCustomGraph_" + str(cnt))
+
+        flow.nn.Graph._child_init_cnt.clear()
+        for i in range(0, 3):
+            create_graph(i)
+        flow.nn.Graph._child_init_cnt.clear()
+        for i in range(0, 3):
+            create_graph(i)
+        print("cclog: GraphName done")
+
+    def test_graph_build_ctx(test_case):
+        test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), False)
+        with graph_build_util.lazy_mode.gard(True):
+            test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), True)
+            with graph_build_util.lazy_mode.gard(False):
+                test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), False)
+            test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), True)
+        test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), False)
+
+        class CustomGraphGraphBuildCtx(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+                self.config.enable_auto_mixed_precision(True)
+
+            def build(self, x):
+                test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), True)
+                import oneflow.framework.session_context as session_ctx
+                from oneflow.framework.multi_client_session import MultiClientSession
+
+                session = session_ctx.GetDefaultSession()
+                test_case.assertEqual(type(session), MultiClientSession)
+                import oneflow.framework.scope_util as scope_util
+
+                scope = oneflow.current_scope()
+                scope_proto = graph_build_util.scope_to_proto(scope)
+                test_case.assertEqual(session.id, scope_proto.session_id)
+                test_case.assertEqual(
+                    oneflow._oneflow_internal.JobBuildAndInferCtx_GetCurrentJobName(),
+                    self.name,
+                )
+                return x
+
+        test_case.assertTrue(oneflow._oneflow_internal.IsMultiClient())
+        g = CustomGraphGraphBuildCtx()
+        test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), False)
+        data = np.array([2.0, 1.0, 0.0, -1.0, -2.0])
+        x = flow.tensor(data, dtype=flow.float32)
+        g._compile(x)
+        print("graph proto", g._graph_proto)
+        test_case.assertEqual(graph_build_util.lazy_mode.is_enabled(), False)
+
+    def test_block_scope(test_case):
+        class SubModule0(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv1 = flow.nn.Conv2d(1, 1, 5)
+
+            def forward(self, x):
+                scope = oneflow.current_scope()
+                scope_proto = graph_build_util.scope_to_proto(scope)
+                ck_bool = scope_proto.attr_name2attr_value["checkpointing"].at_bool
+                test_case.assertEqual(ck_bool, True)
+                stage_int = scope_proto.attr_name2attr_value[
+                    "pipeline_stage_id_hint"
+                ].at_int64
+                test_case.assertEqual(stage_int, 0)
+                weight = self.conv1.weight
+                test_case.assertEqual(type(weight), flow.nn.graph.Block)
+                return self.conv1(x)
+
+        class SubModule1(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc1 = flow.nn.Linear(36, 4, False)
+                self.register_buffer("dummy_buff", flow.Tensor(1, 4))
+
+            def forward(self, x):
+                scope = oneflow.current_scope()
+                scope_proto = graph_build_util.scope_to_proto(scope)
+                test_case.assertEqual(
+                    scope_proto.parent_scope_symbol_id, self.prev_scope.symbol_id
+                )
+                ck_bool = scope_proto.attr_name2attr_value["checkpointing"]
+                test_case.assertEqual(ck_bool.WhichOneof("value"), None)
+                stage_int = scope_proto.attr_name2attr_value[
+                    "pipeline_stage_id_hint"
+                ].at_int64
+                test_case.assertEqual(stage_int, 1)
+                name = self.name_prefix + self.name
+                prefixes = []
+                for prefix in scope_proto.scope_op_name_prefixes:
+                    prefixes.append(prefix)
+                name_in_scope = ".".join(prefixes)
+                test_case.assertEqual(name, name_in_scope)
+                b = self.dummy_buff
+                dummy_buff_scope_proto = graph_build_util.scope_to_proto(
+                    self._buffers["dummy_buff"].scope
+                )
+                test_case.assertEqual(
+                    dummy_buff_scope_proto.parent_scope_symbol_id, scope.symbol_id
+                )
+                x = self.fc1(x)
+                return x + b
+
+        class CustomModule1(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.layer0 = SubModule0()
+                self.layer1 = SubModule1()
+
+            def forward(self, x, y):
+                print("x0: ", x.shape)
+                x = self.layer0(x)
+                print("x1: ", x.shape)
+                print("y0: ", y.shape)
+                y = self.layer1(y)
+                print("y1: ", y.shape)
+                return (x, y)
+
+        m = CustomModule1()
+
+        class CustomGraphBlockScope(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+                self.m = m
+                self.m.layer0.config.stage_id = 0
+                self.m.layer0.config.activation_checkpointing = True
+                self.m.layer1.config.stage_id = 1
+
+            def build(self, x, y):
+                return self.m(x, y)
+
+        g = CustomGraphBlockScope()
+        x = np.ones((1, 1, 10, 10))
+        x = flow.tensor(x, dtype=flow.float32)
+        y = np.ones((16, 36))
+        y = flow.tensor(y, dtype=flow.float32)
+        g._compile(x, y)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_graph_optimizer.py b/python/oneflow/test/graph/test_graph_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ad55d369c2f1166578ab07d5d94c288d42d5f55
--- /dev/null
+++ b/python/oneflow/test/graph/test_graph_optimizer.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow
+import oneflow as flow
+import oneflow.unittest
+
+
+@unittest.skip(
+    " NOTE(chengcheng): nn.Graph train cannot run right now for JobCompleter."
+)
+class TestGraphOptimizer(flow.unittest.TestCase):
+    def test_optimizer(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.para0 = flow.nn.Parameter(flow.Tensor(1, 4))
+                self.para1 = flow.nn.Parameter(flow.Tensor(1, 4))
+                self.para2 = flow.nn.Parameter(flow.Tensor(1, 4))
+                self.para2.requires_grad_(False)
+                self.para3 = flow.nn.Parameter(flow.Tensor(1, 4))
+                self.para4 = flow.nn.Parameter(flow.Tensor(1, 4))
+
+            def forward(self, x):
+                return x
+
+        m = CustomModule()
+        learning_rate = 0.1
+        momentum = 0.2
+        scale = 0.3
+        sgd0 = flow.optim.SGD(
+            [
+                {
+                    "params": [m.para0, m.para1, m.para2],
+                    "lr": learning_rate,
+                    "momentum": momentum,
+                    "scale": scale,
+                }
+            ]
+        )
+        sgd1 = flow.optim.SGD(
+            [
+                {
+                    "params": [m.para3],
+                    "lr": learning_rate,
+                    "momentum": momentum,
+                    "scale": scale,
+                },
+                {
+                    "params": [m.para4],
+                    "lr": learning_rate,
+                    "momentum": momentum,
+                    "scale": scale,
+                },
+            ]
+        )
+
+        class CustomGraph0(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+                self.m = m
+                self.add_optimizer("sgd0", sgd0)
+                self.add_optimizer("sgd1", sgd1)
+
+            def build(self, x):
+                out = self.m(x)
+                out.backward()
+                return out
+
+        g = CustomGraph0()
+        x = flow.Tensor(1, 1, 10, 10)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        z = g._compile(x)
+        print(repr(g))
+        print("g.config.proto: \n", g.config.proto)
+        print("graph proto: \n", g._graph_proto)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_graph_relu.py b/python/oneflow/test/graph/test_graph_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea123e3822fe292a88b0edd6e43b249bd7b0c883
--- /dev/null
+++ b/python/oneflow/test/graph/test_graph_relu.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.framework.graph_build_util as graph_build_util
+import oneflow.unittest
+
+
+@unittest.skip(" nn.Graph cannnot run right now ")
+class TestReluGraph(flow.unittest.TestCase):
+    def test_relu_graph(test_case):
+        data = np.array([2.0, 1.0, 0.0, -1.0, -2.0])
+        x = flow.tensor(data, dtype=flow.float32)
+        MyRelu = flow.nn.ReLU()
+        y_eager = MyRelu(x)
+        print("eager out :", y_eager)
+
+        class ReluGraph(flow.nn.Graph):
+            def __init__(self):
+                super().__init__()
+                self.cc_relu = MyRelu
+
+            def build(self, x):
+                return self.cc_relu(x)
+
+        relu_g = ReluGraph()
+        y_lazy = relu_g(x)[0]
+        print("lazy out :", y_lazy)
+        test_case.assertTrue(np.array_equal(y_eager.numpy(), y_lazy.numpy()))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_input_op_expr.py b/python/oneflow/test/graph/test_input_op_expr.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ea839a3573f546618091c21ff94d1d8e80c083a
--- /dev/null
+++ b/python/oneflow/test/graph/test_input_op_expr.py
@@ -0,0 +1,72 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.unittest
+from oneflow.framework.multi_client_session import MultiClientSession
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFeedInputTensor(unittest.TestCase):
+    def test_feed_input_tensor(test_case):
+        test_case.assertTrue(oneflow.distributed.is_multi_client())
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
+        x = flow.Tensor(1, 1, 10, 10)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        session = session_ctx.GetDefaultSession()
+        test_case.assertTrue(isinstance(session, MultiClientSession))
+        session.TryInit()
+        with oneflow._oneflow_internal.lazy_mode.gard(True):
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Open(
+                "cc_test_input_op_expr_job"
+            )
+            job_conf = (
+                oneflow._oneflow_internal.oneflow.core.job.job_conf.JobConfigProto()
+            )
+            job_conf.set_job_name("cc_test_input_op_expr_job")
+            job_conf.mutable_predict_conf()
+            c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+            op_name = "cc_Input_0"
+            input_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedInputOpConf()
+            )
+            input_conf.set_in_0("EagerTensorInput")
+            input_conf.set_out_0("out_0")
+            input_op = oneflow._oneflow_internal.one.FeedInputOpExpr(
+                op_name, input_conf, ["in_0"], ["out_0"]
+            )
+            attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+            if not x.is_determined:
+                x.determine()
+            x_tensor_in_c = x._local_or_consistent_tensor
+            out_tensor = input_op.apply([x_tensor_in_c], attrs)[0]
+            test_case.assertEqual(out_tensor.shape, (1, 1, 10, 10))
+            test_case.assertTrue(out_tensor.is_lazy)
+            test_case.assertTrue(out_tensor.is_local)
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_multi_client_session.py b/python/oneflow/test/graph/test_multi_client_session.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7e8db64bba35900da9fa95fcca050716194c22
--- /dev/null
+++ b/python/oneflow/test/graph/test_multi_client_session.py
@@ -0,0 +1,45 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import oneflow
+import oneflow as flow
+import oneflow.framework.session_context as session_ctx
+import oneflow.unittest
+from oneflow.framework.multi_client_session import MultiClientSession
+
+
+class TestMultiClientSession(unittest.TestCase):
+    def test_case1(self):
+        self.assertTrue(flow.distributed.is_multi_client())
+        sess = session_ctx.GetDefaultSession()
+        self.assertTrue(isinstance(sess, MultiClientSession))
+        sess.TryInit()
+        self.assertEqual(sess.status, sess.Status.INITED)
+
+    def test_case2(self):
+        print("test_case2")
+        self.assertTrue(flow.distributed.is_multi_client())
+        sess = session_ctx.GetDefaultSession()
+        self.assertTrue(isinstance(sess, MultiClientSession))
+        sess.TryInit()
+        self.assertEqual(sess.status, sess.Status.INITED)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_output_op_expr.py b/python/oneflow/test/graph/test_output_op_expr.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b5097e1dffd591f68676fcaa5295d592026f54c
--- /dev/null
+++ b/python/oneflow/test/graph/test_output_op_expr.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.unittest
+from oneflow.framework.multi_client_session import MultiClientSession
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFetchOutputTensor(unittest.TestCase):
+    def test_fetch_output_tensor(test_case):
+        test_case.assertTrue(oneflow.distributed.is_multi_client())
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
+        x = flow.Tensor(1, 1, 10, 10)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        session = session_ctx.GetDefaultSession()
+        test_case.assertTrue(isinstance(session, MultiClientSession))
+        session.TryInit()
+        with oneflow._oneflow_internal.lazy_mode.gard(True):
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Open(
+                "cc_test_output_op_expr_job"
+            )
+            job_conf = (
+                oneflow._oneflow_internal.oneflow.core.job.job_conf.JobConfigProto()
+            )
+            job_conf.set_job_name("cc_test_output_op_expr_job")
+            job_conf.mutable_predict_conf()
+            c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+            attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+            input_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedInputOpConf()
+            )
+            input_conf.set_in_0("EagerTensorInput")
+            input_conf.set_out_0("out_0")
+            input_op = oneflow._oneflow_internal.one.FeedInputOpExpr(
+                "cc_Input_0", input_conf, ["in_0"], ["out_0"]
+            )
+            output_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FetchOutputOpConf()
+            )
+            output_conf.set_in_0("LazyTensorInput")
+            output_conf.set_out_0("out_0")
+            output_op = oneflow._oneflow_internal.one.FetchOutputOpExpr(
+                "cc_Output_0", output_conf, ["in_0"], ["out_0"]
+            )
+            if not x.is_determined:
+                x.determine()
+            x_tensor_in_c = x._local_or_consistent_tensor
+            lazy_tensor = input_op.apply([x_tensor_in_c], attrs)[0]
+            test_case.assertEqual(lazy_tensor.shape, (1, 1, 10, 10))
+            test_case.assertTrue(lazy_tensor.is_lazy)
+            test_case.assertTrue(lazy_tensor.is_local)
+            eager_tensor = output_op.apply([lazy_tensor], attrs)[0]
+            test_case.assertEqual(eager_tensor.shape, (1, 1, 10, 10))
+            test_case.assertTrue(not eager_tensor.is_lazy)
+            test_case.assertTrue(eager_tensor.is_local)
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_user_op_expr.py b/python/oneflow/test/graph/test_user_op_expr.py
new file mode 100644
index 0000000000000000000000000000000000000000..89f4ca341c55708a8c98907a049bae094cccb851
--- /dev/null
+++ b/python/oneflow/test/graph/test_user_op_expr.py
@@ -0,0 +1,117 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.unittest
+from oneflow.framework.multi_client_session import MultiClientSession
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUserOpGraph(unittest.TestCase):
+    def test_user_op_graph(test_case):
+        test_case.assertTrue(oneflow.distributed.is_multi_client())
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
+        x0 = flow.Tensor(20, 30)
+        weight0 = flow.Tensor(30, 50)
+        x1 = flow.Tensor(50, 70)
+        flow.nn.init.uniform_(x0, a=-1.0, b=1.0)
+        flow.nn.init.uniform_(x1, a=-1.0, b=1.0)
+        flow.nn.init.uniform_(weight0, a=-1.0, b=1.0)
+        session = session_ctx.GetDefaultSession()
+        test_case.assertTrue(isinstance(session, MultiClientSession))
+        session.TryInit()
+        with oneflow._oneflow_internal.lazy_mode.gard(True):
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Open(
+                "cc_test_user_op_expr_job"
+            )
+            job_conf = (
+                oneflow._oneflow_internal.oneflow.core.job.job_conf.JobConfigProto()
+            )
+            job_conf.set_job_name("cc_test_user_op_expr_job")
+            job_conf.mutable_predict_conf()
+            c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+            x0_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedInputOpConf()
+            )
+            x0_op = oneflow._oneflow_internal.one.FeedInputOpExpr(
+                "cc_Input_0", x0_conf, ["in_0"], ["out_0"]
+            )
+            x1_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedInputOpConf()
+            )
+            x1_op = oneflow._oneflow_internal.one.FeedInputOpExpr(
+                "cc_Input_1", x1_conf, ["in_0"], ["out_0"]
+            )
+            weight0_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedVariableOpConf()
+            )
+            weight0_op = oneflow._oneflow_internal.one.FeedVariableOpExpr(
+                "cc_Variable_0", weight0_conf, ["in_0"], ["out_0"]
+            )
+            output_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FetchOutputOpConf()
+            )
+            output_op = oneflow._oneflow_internal.one.FetchOutputOpExpr(
+                "cc_Output_0", output_conf, ["in_0"], ["out_0"]
+            )
+            attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+            if not x0.is_determined:
+                x0.determine()
+            x0_tensor_in_c = x0._local_or_consistent_tensor
+            if not x1.is_determined:
+                x1.determine()
+            x1_tensor_in_c = x1._local_or_consistent_tensor
+            if not weight0.is_determined:
+                weight0.determine()
+            weight0_tensor_in_c = weight0._local_or_consistent_tensor
+            x0_lazy_tensor = x0_op.apply([x0_tensor_in_c], attrs)[0]
+            x1_lazy_tensor = x1_op.apply([x1_tensor_in_c], attrs)[0]
+            weight0_lazy_tensor = weight0_op.apply([weight0_tensor_in_c], attrs)[0]
+            test_case.assertEqual(x0_lazy_tensor.shape, (20, 30))
+            test_case.assertTrue(x0_lazy_tensor.is_lazy)
+            test_case.assertEqual(weight0_lazy_tensor.shape, (30, 50))
+            test_case.assertTrue(weight0_lazy_tensor.is_lazy)
+            test_case.assertEqual(x1_lazy_tensor.shape, (50, 70))
+            test_case.assertTrue(x1_lazy_tensor.is_lazy)
+            out0 = flow.F.matmul(x0_lazy_tensor, weight0_lazy_tensor)
+            test_case.assertEqual(out0.shape, (20, 50))
+            test_case.assertTrue(out0.is_lazy)
+            y0 = flow.F.relu(out0)
+            test_case.assertEqual(y0.shape, (20, 50))
+            test_case.assertTrue(y0.is_lazy)
+            out1 = flow.F.matmul(y0, x1_lazy_tensor)
+            test_case.assertEqual(out1.shape, (20, 70))
+            test_case.assertTrue(out1.is_lazy)
+            y1 = flow.F.relu(out1)
+            test_case.assertEqual(y1.shape, (20, 70))
+            test_case.assertTrue(y1.is_lazy)
+            eager_output = output_op.apply([y1], attrs)[0]
+            test_case.assertEqual(eager_output.shape, (20, 70))
+            test_case.assertTrue(not eager_output.is_lazy)
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/graph/test_variable_op_expr.py b/python/oneflow/test/graph/test_variable_op_expr.py
new file mode 100644
index 0000000000000000000000000000000000000000..544222e84ebd3b67f8f689c7c3e63e36d02ab3f3
--- /dev/null
+++ b/python/oneflow/test/graph/test_variable_op_expr.py
@@ -0,0 +1,72 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow
+import oneflow as flow
+import oneflow._oneflow_internal
+import oneflow.framework.c_api_util as c_api_util
+import oneflow.framework.session_context as session_ctx
+import oneflow.unittest
+from oneflow.framework.multi_client_session import MultiClientSession
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFeedVariableTensor(unittest.TestCase):
+    def test_feed_var_tensor(test_case):
+        test_case.assertTrue(oneflow.distributed.is_multi_client())
+        test_case.assertTrue(oneflow.framework.env_util.HasAllMultiClientEnvVars())
+        x = flow.Tensor(1, 1, 10, 10)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        session = session_ctx.GetDefaultSession()
+        test_case.assertTrue(isinstance(session, MultiClientSession))
+        session.TryInit()
+        with oneflow._oneflow_internal.lazy_mode.gard(True):
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Open(
+                "cc_test_variable_op_expr_job"
+            )
+            job_conf = (
+                oneflow._oneflow_internal.oneflow.core.job.job_conf.JobConfigProto()
+            )
+            job_conf.set_job_name("cc_test_variable_op_expr_job")
+            job_conf.mutable_predict_conf()
+            c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
+            op_name = "cc_Variable_0"
+            var_conf = (
+                oneflow._oneflow_internal.oneflow.core.operator.op_conf.FeedVariableOpConf()
+            )
+            var_conf.set_in_0("EagerTensorInput")
+            var_conf.set_out_0("out_0")
+            var_op = oneflow._oneflow_internal.one.FeedVariableOpExpr(
+                op_name, var_conf, ["in_0"], ["out_0"]
+            )
+            attrs = oneflow._oneflow_internal.MutableCfgAttrMap()
+            if not x.is_determined:
+                x.determine()
+            x_tensor_in_c = x._local_or_consistent_tensor
+            out_tensor = var_op.apply([x_tensor_in_c], attrs)[0]
+            test_case.assertEqual(out_tensor.shape, (1, 1, 10, 10))
+            test_case.assertTrue(out_tensor.is_lazy)
+            test_case.assertTrue(out_tensor.is_local)
+            oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/models/1node_test.py b/python/oneflow/test/models/1node_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d23c8fd8b28d01ac08b24c3146450a6eedb88bef
--- /dev/null
+++ b/python/oneflow/test/models/1node_test.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import env_1node
+from absl import app
+from absl.testing import absltest
+from cnns_tests import (
+    TestAlexNetMixin,
+    TestInceptionV3Mixin,
+    TestResNet50Mixin,
+    TestVgg16Mixin,
+)
+from test_1node_mixin import Test1NodeMixin
+
+import oneflow as flow
+
+
+class TestAlexNet(Test1NodeMixin, TestAlexNetMixin, absltest.TestCase):
+    pass
+
+
+class TestResNet50(Test1NodeMixin, TestResNet50Mixin, absltest.TestCase):
+    pass
+
+
+class TestVgg16(Test1NodeMixin, TestVgg16Mixin, absltest.TestCase):
+    pass
+
+
+class TestInceptionV3(Test1NodeMixin, TestInceptionV3Mixin, absltest.TestCase):
+    pass
+
+
+flow.unittest.register_test_cases(
+    scope=globals(),
+    directory=os.path.dirname(os.path.realpath(__file__)),
+    filter_by_num_nodes=lambda x: x == 1,
+    base_class=absltest.TestCase,
+)
+
+
+def main(argv):
+    env_1node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/test/models/2node_test.py b/python/oneflow/test/models/2node_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..847854e02013c6b63d22a41fd5cd58abd7cb72e3
--- /dev/null
+++ b/python/oneflow/test/models/2node_test.py
@@ -0,0 +1,60 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import cnns_tests
+import env_2node
+import numpy
+from absl import app
+from absl.testing import absltest
+from test_2node_mixin import Test2NodeMixin
+
+import oneflow as flow
+
+
+class TestAlexNet(Test2NodeMixin, cnns_tests.TestAlexNetMixin, absltest.TestCase):
+    pass
+
+
+class TestResNet50(Test2NodeMixin, cnns_tests.TestResNet50Mixin, absltest.TestCase):
+    pass
+
+
+class TestVgg16(Test2NodeMixin, cnns_tests.TestVgg16Mixin, absltest.TestCase):
+    pass
+
+
+class TestInceptionV3(
+    Test2NodeMixin, cnns_tests.TestInceptionV3Mixin, absltest.TestCase
+):
+    pass
+
+
+flow.unittest.register_test_cases(
+    scope=globals(),
+    directory=os.path.dirname(os.path.realpath(__file__)),
+    filter_by_num_nodes=lambda x: x == 2,
+    base_class=absltest.TestCase,
+)
+
+
+def main(argv):
+    env_2node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/test/models/alexnet.py b/python/oneflow/test/models/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf08637330a8545652a02ba479b055518de2b899
--- /dev/null
+++ b/python/oneflow/test/models/alexnet.py
@@ -0,0 +1,288 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+import oneflow as flow
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG227/of_record_repeated"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/alexnet/models/of_model_bk"
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-nn", "--num_nodes", type=str, default=1, required=False)
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+
+
+def _conv2d_layer(
+    args,
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.random_uniform_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.nn.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    (image, label) = flow.data.ofrecord_image_classification_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        image_feature_name="encoded",
+        label_feature_name="class/label",
+        color_space="RGB",
+        name="decode",
+    )
+    rsz = flow.image.resize(image, resize_x=227, resize_y=227, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def alexnet(args, images, labels, trainable=True):
+    conv1 = _conv2d_layer(
+        args, "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv2d_layer(args, "conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv2d_layer(args, "conv3", pool2, filters=384)
+    conv4 = _conv2d_layer(args, "conv4", conv3, filters=384)
+    conv5 = _conv2d_layer(args, "conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc1",
+    )
+    dropout1 = fc1
+    fc2 = flow.layers.dense(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc2",
+    )
+    dropout2 = fc2
+    fc3 = flow.layers.dense(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc3",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc3, name="softmax_loss"
+    )
+    return loss
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    flow.config.enable_legacy_model_io(True)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float)
+    func_config.cudnn_conv_force_fwd_algo(0)
+    func_config.cudnn_conv_force_bwd_data_algo(1)
+    func_config.cudnn_conv_force_bwd_filter_algo(1)
+    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def alexnet_train_job():
+        (labels, images) = _data_load_layer(args, args.train_dir)
+        loss = alexnet(args, images, labels)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(function_config=func_config)
+    def alexnet_eval_job():
+        with flow.scope.consistent_view():
+            (labels, images) = _data_load_layer(args, args.eval_dir)
+            return alexnet(args, images, labels, False)
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning alexnet: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = alexnet_train_job().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/alexnet"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    args.num_nodes = len(args.node_list.strip().split(",")) if args.multinode else 1
+    flow.env.ctrl_port(9788)
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/test/models/alexnet_with_unpack.py b/python/oneflow/test/models/alexnet_with_unpack.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ac58d9f09d643cbbeeba6b41b28be934eff4c63
--- /dev/null
+++ b/python/oneflow/test/models/alexnet_with_unpack.py
@@ -0,0 +1,348 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+import oneflow as flow
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG227/of_record_repeated"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/alexnet/models/of_model_bk"
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.num_unpack = 2
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-nn", "--num_nodes", type=str, default=1, required=False)
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+parser.add_argument("-p", "--num_piece_in_batch", type=int, default=2, required=False)
+
+
+def _conv2d_layer(
+    args,
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.random_uniform_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    weight = flow.identity(weight)
+    weight = flow.repeat(weight, args.num_piece_in_batch)
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        bias = flow.identity(bias)
+        bias = flow.repeat(bias, args.num_piece_in_batch)
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=227, resize_y=227, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (
+        flow.unpack(label, args.num_piece_in_batch),
+        flow.unpack(normal, args.num_piece_in_batch),
+    )
+
+
+def _dense_layer(
+    inputs,
+    units,
+    activation=None,
+    use_bias=True,
+    kernel_initializer=None,
+    bias_initializer=None,
+    trainable=True,
+    name=None,
+):
+    in_shape = inputs.shape
+    in_num_axes = len(in_shape)
+    assert in_num_axes >= 2
+    name_prefix = name if name is not None else id_util.UniqueStr("Dense_")
+    inputs = flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs
+    weight = flow.get_variable(
+        name="{}-weight".format(name_prefix),
+        shape=(units, inputs.shape[1]),
+        dtype=inputs.dtype,
+        initializer=kernel_initializer
+        if kernel_initializer is not None
+        else flow.constant_initializer(0),
+        trainable=trainable,
+        model_name="weight",
+    )
+    weight = flow.identity(weight)
+    weight = flow.repeat(weight, args.num_piece_in_batch)
+    out = flow.matmul(
+        a=inputs, b=weight, transpose_b=True, name="{}_matmul".format(name_prefix)
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name="{}-bias".format(name_prefix),
+            shape=(units,),
+            dtype=inputs.dtype,
+            initializer=bias_initializer
+            if bias_initializer is not None
+            else flow.constant_initializer(0),
+            trainable=trainable,
+            model_name="bias",
+        )
+        bias = flow.identity(bias)
+        bias = flow.repeat(bias, args.num_piece_in_batch)
+        out = flow.nn.bias_add(out, bias, name="{}_bias_add".format(name_prefix))
+    out = (
+        activation(out, name="{}_activation".format(name_prefix))
+        if activation is not None
+        else out
+    )
+    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
+    return out
+
+
+def alexnet(args, images, labels, trainable=True):
+    conv1 = _conv2d_layer(
+        args, "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+    )
+    pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv2d_layer(args, "conv2", pool1, filters=192, kernel_size=5)
+    pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv2d_layer(args, "conv3", pool2, filters=384)
+    conv4 = _conv2d_layer(args, "conv4", conv3, filters=384)
+    conv5 = _conv2d_layer(args, "conv5", conv4, filters=256)
+    pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    if len(pool5.shape) > 2:
+        pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+    fc1 = _dense_layer(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc1",
+    )
+    dropout1 = fc1
+    fc2 = _dense_layer(
+        inputs=dropout1,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc2",
+    )
+    dropout2 = fc2
+    fc3 = _dense_layer(
+        inputs=dropout2,
+        units=1001,
+        activation=None,
+        use_bias=False,
+        kernel_initializer=_get_initializer(),
+        bias_initializer=False,
+        trainable=trainable,
+        name="fc3",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc3, name="softmax_loss"
+    )
+    return loss
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float)
+    func_config.cudnn_conv_force_fwd_algo(0)
+    func_config.cudnn_conv_force_bwd_data_algo(1)
+    func_config.cudnn_conv_force_bwd_filter_algo(1)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def alexnet_train_job():
+        (labels, images) = _data_load_layer(args, args.train_dir)
+        loss = alexnet(args, images, labels)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        ).minimize(loss)
+        return flow.pack(loss, args.num_piece_in_batch)
+
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float)
+
+    @flow.global_function(function_config=func_config)
+    def alexnet_eval_job():
+        with flow.scope.consistent_view():
+            (labels, images) = _data_load_layer(args, args.eval_dir)
+            loss = alexnet(args, images, labels)
+            return flow.pack(loss, args.num_piece_in_batch)
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning alexnet: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = alexnet_train_job().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/alexnet"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    args.num_nodes = len(args.node_list.strip().split(",")) if args.multinode else 1
+    flow.env.ctrl_port(9788)
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/test/models/bert.py b/python/oneflow/test/models/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..14856c38558cd2856be4e6dd1ea741d923f47ef7
--- /dev/null
+++ b/python/oneflow/test/models/bert.py
@@ -0,0 +1,399 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+
+import oneflow as flow
+import oneflow.core.common.data_type_pb2 as data_type_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+class BertBackbone(object):
+    def __init__(
+        self,
+        input_ids_blob,
+        input_mask_blob,
+        token_type_ids_blob,
+        vocab_size,
+        seq_length=512,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        initializer_range=0.02,
+    ):
+        with flow.scope.namespace("bert"):
+            with flow.scope.namespace("embeddings"):
+                (self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
+                    input_ids_blob=input_ids_blob,
+                    vocab_size=vocab_size,
+                    embedding_size=hidden_size,
+                    initializer_range=initializer_range,
+                    word_embedding_name="word_embeddings",
+                )
+                self.embedding_output_ = _EmbeddingPostprocessor(
+                    input_blob=self.embedding_output_,
+                    seq_length=seq_length,
+                    embedding_size=hidden_size,
+                    use_token_type=True,
+                    token_type_ids_blob=token_type_ids_blob,
+                    token_type_vocab_size=type_vocab_size,
+                    token_type_embedding_name="token_type_embeddings",
+                    use_position_embeddings=True,
+                    position_embedding_name="position_embeddings",
+                    initializer_range=initializer_range,
+                    max_position_embeddings=max_position_embeddings,
+                    dropout_prob=hidden_dropout_prob,
+                )
+            with flow.scope.namespace("encoder"):
+                addr_blob = _CreateAttentionMaskFromInputMask(
+                    input_mask_blob,
+                    from_seq_length=seq_length,
+                    to_seq_length=seq_length,
+                )
+                self.all_encoder_layers_ = _TransformerModel(
+                    input_blob=self.embedding_output_,
+                    addr_blob=addr_blob,
+                    seq_length=seq_length,
+                    hidden_size=hidden_size,
+                    num_hidden_layers=num_hidden_layers,
+                    num_attention_heads=num_attention_heads,
+                    intermediate_size=intermediate_size,
+                    intermediate_act_fn=GetActivation(hidden_act),
+                    hidden_dropout_prob=hidden_dropout_prob,
+                    attention_probs_dropout_prob=attention_probs_dropout_prob,
+                    initializer_range=initializer_range,
+                    do_return_all_layers=False,
+                )
+            self.sequence_output_ = self.all_encoder_layers_[-1]
+
+    def embedding_output(self):
+        return self.embedding_output_
+
+    def all_encoder_layers(self):
+        return self.all_encoder_layers_
+
+    def sequence_output(self):
+        return self.sequence_output_
+
+    def embedding_table(self):
+        return self.embedding_table_
+
+
+def CreateInitializer(std):
+    return flow.truncated_normal(std)
+
+
+def _Gelu(in_blob):
+    return flow.math.gelu(in_blob)
+
+
+def _TransformerModel(
+    input_blob,
+    addr_blob,
+    seq_length,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    intermediate_act_fn=_Gelu,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    initializer_range=0.02,
+    do_return_all_layers=False,
+):
+    assert hidden_size % num_attention_heads == 0
+    attention_head_size = int(hidden_size / num_attention_heads)
+    input_width = hidden_size
+    prev_output_blob = flow.reshape(input_blob, (-1, input_width))
+    all_layer_output_blobs = []
+    for layer_idx in range(num_hidden_layers):
+        with flow.scope.namespace("layer_%d" % layer_idx):
+            layer_input_blob = prev_output_blob
+            with flow.scope.namespace("attention"):
+                with flow.scope.namespace("self"):
+                    attention_output_blob = _AttentionLayer(
+                        from_blob=layer_input_blob,
+                        to_blob=layer_input_blob,
+                        addr_blob=addr_blob,
+                        num_attention_heads=num_attention_heads,
+                        size_per_head=attention_head_size,
+                        attention_probs_dropout_prob=attention_probs_dropout_prob,
+                        initializer_range=initializer_range,
+                        do_return_2d_tensor=True,
+                        from_seq_length=seq_length,
+                        to_seq_length=seq_length,
+                    )
+                with flow.scope.namespace("output"):
+                    attention_output_blob = _FullyConnected(
+                        attention_output_blob,
+                        input_size=num_attention_heads * attention_head_size,
+                        units=hidden_size,
+                        weight_initializer=CreateInitializer(initializer_range),
+                        name="dense",
+                    )
+                    attention_output_blob = _Dropout(
+                        attention_output_blob, hidden_dropout_prob
+                    )
+                    attention_output_blob = attention_output_blob + layer_input_blob
+                    attention_output_blob = _LayerNorm(
+                        attention_output_blob, hidden_size
+                    )
+            with flow.scope.namespace("intermediate"):
+                if callable(intermediate_act_fn):
+                    act_fn = op_conf_util.kNone
+                else:
+                    act_fn = intermediate_act_fn
+                intermediate_output_blob = _FullyConnected(
+                    attention_output_blob,
+                    input_size=num_attention_heads * attention_head_size,
+                    units=intermediate_size,
+                    activation=act_fn,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                if callable(intermediate_act_fn):
+                    intermediate_output_blob = intermediate_act_fn(
+                        intermediate_output_blob
+                    )
+            with flow.scope.namespace("output"):
+                layer_output_blob = _FullyConnected(
+                    intermediate_output_blob,
+                    input_size=intermediate_size,
+                    units=hidden_size,
+                    weight_initializer=CreateInitializer(initializer_range),
+                    name="dense",
+                )
+                layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
+                layer_output_blob = layer_output_blob + attention_output_blob
+                layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
+                prev_output_blob = layer_output_blob
+                all_layer_output_blobs.append(layer_output_blob)
+    input_shape = (-1, seq_length, hidden_size)
+    if do_return_all_layers:
+        final_output_blobs = []
+        for layer_output_blob in all_layer_output_blobs:
+            final_output_blob = flow.reshape(layer_output_blob, input_shape)
+            final_output_blobs.append(final_output_blob)
+        return final_output_blobs
+    else:
+        final_output_blob = flow.reshape(prev_output_blob, input_shape)
+        return [final_output_blob]
+
+
+def _AttentionLayer(
+    from_blob,
+    to_blob,
+    addr_blob,
+    num_attention_heads=1,
+    size_per_head=512,
+    query_act=op_conf_util.kNone,
+    key_act=op_conf_util.kNone,
+    value_act=op_conf_util.kNone,
+    attention_probs_dropout_prob=0.0,
+    initializer_range=0.02,
+    do_return_2d_tensor=False,
+    batch_size=None,
+    from_seq_length=None,
+    to_seq_length=None,
+):
+    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
+        output_blob = flow.reshape(
+            input_blob, [-1, seq_length, num_attention_heads, width]
+        )
+        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
+        return output_blob
+
+    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
+    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])
+    query_blob = _FullyConnected(
+        from_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=query_act,
+        name="query",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    key_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=key_act,
+        name="key",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    value_blob = _FullyConnected(
+        to_blob_2d,
+        input_size=num_attention_heads * size_per_head,
+        units=num_attention_heads * size_per_head,
+        activation=value_act,
+        name="value",
+        weight_initializer=CreateInitializer(initializer_range),
+    )
+    query_blob = TransposeForScores(
+        query_blob, num_attention_heads, from_seq_length, size_per_head
+    )
+    key_blob = TransposeForScores(
+        key_blob, num_attention_heads, to_seq_length, size_per_head
+    )
+    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
+    attention_scores_blob = attention_scores_blob * (
+        1.0 / math.sqrt(float(size_per_head))
+    )
+    attention_scores_blob = attention_scores_blob + addr_blob
+    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
+    attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)
+    value_blob = flow.reshape(
+        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head]
+    )
+    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
+    context_blob = flow.matmul(attention_probs_blob, value_blob)
+    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])
+    if do_return_2d_tensor:
+        context_blob = flow.reshape(
+            context_blob, [-1, num_attention_heads * size_per_head]
+        )
+    else:
+        context_blob = flow.reshape(
+            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head]
+        )
+    return context_blob
+
+
+def _FullyConnected(
+    input_blob, input_size, units, activation=None, name=None, weight_initializer=None
+):
+    weight_blob = flow.get_variable(
+        name=name + "-weight",
+        shape=[input_size, units],
+        dtype=input_blob.dtype,
+        model_name="weight",
+        initializer=weight_initializer,
+    )
+    bias_blob = flow.get_variable(
+        name=name + "-bias",
+        shape=[units],
+        dtype=input_blob.dtype,
+        model_name="bias",
+        initializer=flow.constant_initializer(0.0),
+    )
+    output_blob = flow.matmul(input_blob, weight_blob)
+    output_blob = flow.nn.bias_add(output_blob, bias_blob)
+    return output_blob
+
+
+def _Dropout(input_blob, dropout_prob):
+    if dropout_prob == 0.0:
+        return input_blob
+    return flow.nn.dropout(input_blob, rate=dropout_prob)
+
+
+def _LayerNorm(input_blob, hidden_size):
+    return flow.layers.layer_norm(
+        input_blob, name="LayerNorm", begin_norm_axis=-1, begin_params_axis=-1
+    )
+
+
+def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
+    output = flow.cast(to_mask_blob, dtype=flow.float)
+    output = flow.reshape(output, [-1, 1, to_seq_length])
+    zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
+    attention_mask_blob = zeros + output
+    attention_mask_blob = flow.reshape(
+        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length]
+    )
+    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
+    addr_blob = (attention_mask_blob - 1.0) * 10000.0
+    return addr_blob
+
+
+def _EmbeddingPostprocessor(
+    input_blob,
+    seq_length,
+    embedding_size,
+    use_token_type=False,
+    token_type_ids_blob=None,
+    token_type_vocab_size=16,
+    token_type_embedding_name="token_type_embeddings",
+    use_position_embeddings=True,
+    position_embedding_name="position_embeddings",
+    initializer_range=0.02,
+    max_position_embeddings=512,
+    dropout_prob=0.1,
+):
+    output = input_blob
+    if use_token_type:
+        assert token_type_ids_blob is not None
+        token_type_table = flow.get_variable(
+            name=token_type_embedding_name,
+            shape=[token_type_vocab_size, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        token_type_embeddings = flow.gather(
+            params=token_type_table, indices=token_type_ids_blob, axis=0
+        )
+        output = output + token_type_embeddings
+    if use_position_embeddings:
+        position_table = flow.get_variable(
+            name=position_embedding_name,
+            shape=[1, max_position_embeddings, embedding_size],
+            dtype=input_blob.dtype,
+            initializer=CreateInitializer(initializer_range),
+        )
+        assert seq_length <= max_position_embeddings
+        if seq_length != max_position_embeddings:
+            position_table = flow.slice(
+                position_table, begin=[None, 0, 0], size=[None, seq_length, -1]
+            )
+        output = output + position_table
+    output = _LayerNorm(output, embedding_size)
+    output = _Dropout(output, dropout_prob)
+    return output
+
+
+def _EmbeddingLookup(
+    input_ids_blob,
+    vocab_size,
+    embedding_size=128,
+    initializer_range=0.02,
+    word_embedding_name="word_embeddings",
+):
+    embedding_table = flow.get_variable(
+        name=word_embedding_name,
+        shape=[vocab_size, embedding_size],
+        dtype=flow.float,
+        initializer=CreateInitializer(initializer_range),
+    )
+    output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
+    return (output, embedding_table)
+
+
+def GetActivation(name):
+    if name == "linear":
+        return None
+    elif name == "relu":
+        return flow.math.relu
+    elif name == "tanh":
+        return flow.math.tanh
+    elif name == "gelu":
+        return flow.math.gelu
+    else:
+        raise Exception("unsupported activation")
diff --git a/python/oneflow/test/models/cnns_tests.py b/python/oneflow/test/models/cnns_tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8898c9887b4d75f5449b5dfdc22db22b5be8fc3
--- /dev/null
+++ b/python/oneflow/test/models/cnns_tests.py
@@ -0,0 +1,197 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import imp
+import os
+import sys
+
+import numpy
+from absl import app, flags
+
+import oneflow
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("python_bin", "python3", "python binary program name or filepath.")
+flags.DEFINE_boolean(
+    "enable_auto_mixed_precision",
+    False,
+    "automatically change float net to mixed precision net",
+)
+
+
+class TestNetMixin:
+    """
+    Base Tester
+  """
+
+    def setUp(self):
+        self.net = ""
+        self.tf_loss_dir = ""
+        self.of_loss_dir = ""
+        self.num_iter = 10
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            self.num_iter = 3
+        self.set_params()
+        oneflow.clear_default_session()
+
+    def set_params(self):
+        pass
+
+    def assert_tolerance_4_mixed_precision(self):
+        raise AssertionError
+
+    def run_net(self, num_gpu_per_node, num_node=1, node_list=""):
+        net_modudle = _Import(self.net)
+        spec = net_modudle.DLNetSpec(FLAGS.enable_auto_mixed_precision)
+        spec.num_nodes = num_node
+        spec.gpu_num_per_node = num_gpu_per_node
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            spec.iter_num = 3
+        net_modudle.main(spec)
+        return
+        if num_node > 1:
+            os.system(
+                "{} {}.py -g {} -m -n {}".format(
+                    FLAGS.python_bin, self.net, num_gpu_per_node, node_list
+                )
+            )
+        else:
+            os.system(
+                "{} {}.py -g {}".format(FLAGS.python_bin, self.net, num_gpu_per_node)
+            )
+
+    def load_tf_loss(self):
+        tf_loss = numpy.load(os.path.join(self.tf_loss_dir, "1n1c.npy"))
+        return tf_loss[0 : self.num_iter]
+
+    def load_of_loss(self, test_type):
+        path = os.path.join(self.of_loss_dir, test_type + ".npy")
+        if os.path.exists(path):
+            of_loss = numpy.load(path)
+        else:
+            of_loss = numpy.zeros(self.num_iter)
+        return of_loss[0 : self.num_iter]
+
+    def print_and_check_result(self, result_name):
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            if self.net == "resnet50":
+                print("WARNING: skipping check for resnet50 cpu due to GEMM NaN")
+                return
+        loss_dict = {}
+        loss_dict["tensorflow"] = self.load_tf_loss()
+        loss_dict["oneflow"] = self.load_of_loss(result_name)
+        print("==".ljust(64, "="))
+        print(" ".ljust(2, " ") + self.net + " loss report")
+        print("==".ljust(64, "="))
+        fmt_str = "{:>6}  {:>12}  {:>12}"
+        print(fmt_str.format("iter", "tensorflow", "oneflow-" + result_name))
+        for i in range(self.num_iter):
+            fmt_str = "{:>6}  {:>12.6f}  {:>12.6f}"
+            print(
+                fmt_str.format(i, loss_dict["tensorflow"][i], loss_dict["oneflow"][i])
+            )
+        if FLAGS.enable_auto_mixed_precision:
+            tolerance = self.assert_tolerance_4_mixed_precision()
+            rtol = tolerance["rtol"]
+            atol = tolerance["atol"]
+            print(
+                "assert tolerance for mixed_precision are: rtol", rtol, ", atol", atol
+            )
+            self.assertTrue(
+                numpy.allclose(
+                    loss_dict["tensorflow"], loss_dict["oneflow"], rtol=rtol, atol=atol
+                )
+            )
+        else:
+            self.assertTrue(
+                numpy.allclose(loss_dict["tensorflow"], loss_dict["oneflow"])
+            )
+
+
+class TestAlexNetMixin(TestNetMixin):
+    """
+    AlexNet Tester
+  """
+
+    def set_params(self):
+        self.net = "alexnet"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 1e-05, "atol": 0.01}
+
+
+class TestResNet50Mixin(TestNetMixin):
+    """
+    AlexNet Tester
+  """
+
+    def set_params(self):
+        self.net = "resnet50"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 1e-08, "atol": 1e-05}
+
+
+class TestVgg16Mixin(TestNetMixin):
+    """
+    Vgg16 Tester
+  """
+
+    def set_params(self):
+        self.net = "vgg16"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 0.0001, "atol": 0.1}
+
+
+class TestInceptionV3Mixin(TestNetMixin):
+    """
+    InceptionV3 Tester
+  """
+
+    def set_params(self):
+        self.net = "inceptionv3"
+        self.tf_loss_dir = os.path.join(
+            "/dataset/PNGS/cnns_model_for_test/tf_loss", self.net
+        )
+        self.of_loss_dir = os.path.join("./of_loss", self.net)
+
+    def assert_tolerance_4_mixed_precision(self):
+        return {"rtol": 1e-05, "atol": 0.01}
+
+
+def _Import(name, globals=None, locals=None, fromlist=None):
+    try:
+        return sys.modules[name]
+    except KeyError:
+        pass
+    (fp, pathname, description) = imp.find_module(name)
+    try:
+        return imp.load_module(name, fp, pathname, description)
+    finally:
+        if fp:
+            fp.close()
diff --git a/python/oneflow/test/models/eager_1node_test.py b/python/oneflow/test/models/eager_1node_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfee26ea7bd3620562c52cdc72076d62e32a0520
--- /dev/null
+++ b/python/oneflow/test/models/eager_1node_test.py
@@ -0,0 +1,77 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import env_1node
+from absl import app
+from absl.testing import absltest
+from cnns_tests import (
+    TestAlexNetMixin,
+    TestInceptionV3Mixin,
+    TestResNet50Mixin,
+    TestVgg16Mixin,
+)
+from test_1node_mixin import Test1NodeMixin
+
+import oneflow as flow
+
+
+class TestAlexNet(Test1NodeMixin, TestAlexNetMixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestResNet50(Test1NodeMixin, TestResNet50Mixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestVgg16(Test1NodeMixin, TestVgg16Mixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestInceptionV3(Test1NodeMixin, TestInceptionV3Mixin, absltest.TestCase):
+    def setUp(self):
+        super().setUp()
+        flow.enable_eager_execution(True)
+
+
+class TestEagerMixin(object):
+    def setUp(self):
+        flow.clear_default_session()
+        flow.enable_eager_execution(True)
+
+
+flow.unittest.register_test_cases(
+    scope=globals(),
+    directory=os.path.dirname(os.path.realpath(__file__)),
+    filter_by_num_nodes=lambda x: x == 1,
+    base_class=absltest.TestCase,
+    test_case_mixin=TestEagerMixin,
+)
+
+
+def main(argv):
+    env_1node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/test/models/env_1node.py b/python/oneflow/test/models/env_1node.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf337b843a9cb9ecd126b8cea135174e72015c75
--- /dev/null
+++ b/python/oneflow/test/models/env_1node.py
@@ -0,0 +1,20 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import oneflow
+
+
+def Init():
+    oneflow.env.init()
diff --git a/python/oneflow/test/models/env_2node.py b/python/oneflow/test/models/env_2node.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5950f937c5c46b992399d0efd4ba86be7bcc90
--- /dev/null
+++ b/python/oneflow/test/models/env_2node.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import atexit
+
+from absl import flags
+
+import oneflow as flow
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string(
+    "nodes_list", "192.168.1.15,192.168.1.16", "nodes list seperated by comma"
+)
+flags.DEFINE_integer("ctrl_port", "9524", "control port")
+
+
+def Init():
+    flow.env.machine(FLAGS.nodes_list.split(","))
+    flow.env.ctrl_port(FLAGS.ctrl_port)
+    flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    atexit.register(flow.deprecated.delete_worker)
diff --git a/python/oneflow/test/models/inceptionv3.py b/python/oneflow/test/models/inceptionv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..81e775d4f640c1810356332fd8271afb17d19de4
--- /dev/null
+++ b/python/oneflow/test/models/inceptionv3.py
@@ -0,0 +1,637 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+import oneflow as flow
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG299/of_record_repeated"
+_EVAL_DIR = _DATA_DIR
+_TRAIN_DIR = _DATA_DIR
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/inceptionv3/models/of_model"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kSigmoid,
+    use_bias=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+    else:
+        kernel_size = tuple(kernel_size)
+    weight_shape = (filters, input.shape[1]) + kernel_size
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        elif activation == op_conf_util.kSigmoid:
+            output = flow.math.sigmoid(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=299, resize_y=299, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (normal, label)
+
+
+def InceptionA(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch5x5"):
+            branch5x5_1 = _conv2d_layer(
+                "conv0", in_blob, filters=48, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch5x5_2 = _conv2d_layer(
+                "conv1",
+                branch5x5_1,
+                filters=64,
+                kernel_size=5,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=32 if index == 0 else 64,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+        inceptionA_bn = []
+        inceptionA_bn.append(branch1x1)
+        inceptionA_bn.append(branch5x5_2)
+        inceptionA_bn.append(branch3x3dbl_3)
+        inceptionA_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionA_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionB(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=3, strides=2, padding="VALID"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=64, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=96,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=96,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool0",
+            )
+        inceptionB_bn = []
+        inceptionB_bn.append(branch3x3)
+        inceptionB_bn.append(branch3x3dbl_3)
+        inceptionB_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionB_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionC(in_blob, index, filters):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch7x7"):
+            branch7x7_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_2 = _conv2d_layer(
+                "conv1",
+                branch7x7_1,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7_3 = _conv2d_layer(
+                "conv2",
+                branch7x7_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch7x7dbl"):
+            branch7x7dbl_1 = _conv2d_layer(
+                "conv0",
+                in_blob,
+                filters=filters,
+                kernel_size=1,
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_2 = _conv2d_layer(
+                "conv1",
+                branch7x7dbl_1,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_3 = _conv2d_layer(
+                "conv2",
+                branch7x7dbl_2,
+                filters=filters,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_4 = _conv2d_layer(
+                "conv3",
+                branch7x7dbl_3,
+                filters=filters,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7dbl_5 = _conv2d_layer(
+                "conv4",
+                branch7x7dbl_4,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionC_bn = []
+        inceptionC_bn.append(branch1x1)
+        inceptionC_bn.append(branch7x7_3)
+        inceptionC_bn.append(branch7x7dbl_5)
+        inceptionC_bn.append(branch_pool_2)
+        mixed_concat = flow.concat(values=inceptionC_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionD(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=320,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch7x7x3"):
+            branch7x7x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=192, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch7x7x3_2 = _conv2d_layer(
+                "conv1",
+                branch7x7x3_1,
+                filters=192,
+                kernel_size=[1, 7],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_3 = _conv2d_layer(
+                "conv2",
+                branch7x7x3_2,
+                filters=192,
+                kernel_size=[7, 1],
+                strides=1,
+                padding="SAME",
+            )
+            branch7x7x3_4 = _conv2d_layer(
+                "conv3",
+                branch7x7x3_3,
+                filters=192,
+                kernel_size=3,
+                strides=2,
+                padding="VALID",
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool = flow.nn.max_pool2d(
+                in_blob,
+                ksize=3,
+                strides=2,
+                padding="VALID",
+                data_format="NCHW",
+                name="pool",
+            )
+        inceptionD_bn = []
+        inceptionD_bn.append(branch3x3_2)
+        inceptionD_bn.append(branch7x7x3_4)
+        inceptionD_bn.append(branch_pool)
+        mixed_concat = flow.concat(values=inceptionD_bn, axis=1, name="concat")
+    return mixed_concat
+
+
+def InceptionE(in_blob, index):
+    with flow.scope.namespace("mixed_{}".format(index)):
+        with flow.scope.namespace("branch1x1"):
+            branch1x1 = _conv2d_layer(
+                "conv0", in_blob, filters=320, kernel_size=1, strides=1, padding="SAME"
+            )
+        with flow.scope.namespace("branch3x3"):
+            branch3x3_1 = _conv2d_layer(
+                "conv0", in_blob, filters=384, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3_2 = _conv2d_layer(
+                "conv1",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3_3 = _conv2d_layer(
+                "conv2",
+                branch3x3_1,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=[1, 1],
+                padding="SAME",
+            )
+            inceptionE_1_bn = []
+            inceptionE_1_bn.append(branch3x3_2)
+            inceptionE_1_bn.append(branch3x3_3)
+            concat_branch3x3 = flow.concat(
+                values=inceptionE_1_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch3x3dbl"):
+            branch3x3dbl_1 = _conv2d_layer(
+                "conv0", in_blob, filters=448, kernel_size=1, strides=1, padding="SAME"
+            )
+            branch3x3dbl_2 = _conv2d_layer(
+                "conv1",
+                branch3x3dbl_1,
+                filters=384,
+                kernel_size=3,
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_3 = _conv2d_layer(
+                "conv2",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[1, 3],
+                strides=1,
+                padding="SAME",
+            )
+            branch3x3dbl_4 = _conv2d_layer(
+                "conv3",
+                branch3x3dbl_2,
+                filters=384,
+                kernel_size=[3, 1],
+                strides=1,
+                padding="SAME",
+            )
+            inceptionE_2_bn = []
+            inceptionE_2_bn.append(branch3x3dbl_3)
+            inceptionE_2_bn.append(branch3x3dbl_4)
+            concat_branch3x3dbl = flow.concat(
+                values=inceptionE_2_bn, axis=1, name="concat"
+            )
+        with flow.scope.namespace("branch_pool"):
+            branch_pool_1 = flow.nn.avg_pool2d(
+                in_blob,
+                ksize=3,
+                strides=1,
+                padding="SAME",
+                data_format="NCHW",
+                name="pool",
+            )
+            branch_pool_2 = _conv2d_layer(
+                "conv",
+                branch_pool_1,
+                filters=192,
+                kernel_size=[1, 1],
+                strides=1,
+                padding="SAME",
+            )
+        inceptionE_total_bn = []
+        inceptionE_total_bn.append(branch1x1)
+        inceptionE_total_bn.append(concat_branch3x3)
+        inceptionE_total_bn.append(concat_branch3x3dbl)
+        inceptionE_total_bn.append(branch_pool_2)
+        concat_total = flow.concat(values=inceptionE_total_bn, axis=1, name="concat")
+    return concat_total
+
+
+def InceptionV3(images, labels, trainable=True):
+    conv0 = _conv2d_layer(
+        "conv0", images, filters=32, kernel_size=3, strides=2, padding="VALID"
+    )
+    conv1 = _conv2d_layer(
+        "conv1", conv0, filters=32, kernel_size=3, strides=1, padding="VALID"
+    )
+    conv2 = _conv2d_layer(
+        "conv2", conv1, filters=64, kernel_size=3, strides=1, padding="SAME"
+    )
+    pool1 = flow.nn.max_pool2d(
+        conv2, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    conv3 = _conv2d_layer(
+        "conv3", pool1, filters=80, kernel_size=1, strides=1, padding="VALID"
+    )
+    conv4 = _conv2d_layer(
+        "conv4", conv3, filters=192, kernel_size=3, strides=1, padding="VALID"
+    )
+    pool2 = flow.nn.max_pool2d(
+        conv4, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool2"
+    )
+    mixed_0 = InceptionA(pool2, 0)
+    mixed_1 = InceptionA(mixed_0, 1)
+    mixed_2 = InceptionA(mixed_1, 2)
+    mixed_3 = InceptionB(mixed_2, 3)
+    mixed_4 = InceptionC(mixed_3, 4, 128)
+    mixed_5 = InceptionC(mixed_4, 5, 160)
+    mixed_6 = InceptionC(mixed_5, 6, 160)
+    mixed_7 = InceptionC(mixed_6, 7, 192)
+    mixed_8 = InceptionD(mixed_7, 8)
+    mixed_9 = InceptionE(mixed_8, 9)
+    mixed_10 = InceptionE(mixed_9, 10)
+    pool3 = flow.nn.avg_pool2d(
+        mixed_10, ksize=8, strides=1, padding="VALID", data_format="NCHW", name="pool3"
+    )
+    with flow.scope.namespace("logits"):
+        pool3 = flow.reshape(pool3, [pool3.shape[0], -1])
+        weight = flow.get_variable(
+            "fc1-weight",
+            shape=(pool3.shape[1], 1001),
+            dtype=flow.float,
+            initializer=flow.truncated_normal(0.816496580927726),
+            model_name="weight",
+        )
+        bias = flow.get_variable(
+            "fc1-bias",
+            shape=(1001,),
+            dtype=flow.float,
+            initializer=flow.constant_initializer(),
+            model_name="bias",
+        )
+        fc1 = flow.matmul(pool3, weight)
+        fc1 = flow.nn.bias_add(fc1, bias)
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=fc1, name="softmax_loss"
+    )
+    return loss
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    func_config = flow.FunctionConfig()
+    func_config.default_logical_view(flow.scope.consistent_view())
+    func_config.default_data_type(flow.float)
+    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=func_config)
+    def TrainNet():
+        (images, labels) = _data_load_layer(args, args.train_dir)
+        loss = InceptionV3(images, labels)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning inceptionv3: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = TrainNet().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/inceptionv3"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/test/models/pretrain.py b/python/oneflow/test/models/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe63708bdf1e791c70f1729abebf267f09ed23f8
--- /dev/null
+++ b/python/oneflow/test/models/pretrain.py
@@ -0,0 +1,191 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import bert as bert_util
+
+import oneflow as flow
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+
+def PreTrain(
+    input_ids_blob,
+    input_mask_blob,
+    token_type_ids_blob,
+    masked_lm_positions_blob,
+    masked_lm_ids_blob,
+    masked_lm_weights_blob,
+    next_sentence_label_blob,
+    vocab_size,
+    seq_length=512,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    max_position_embeddings=512,
+    type_vocab_size=16,
+    max_predictions_per_seq=20,
+    initializer_range=0.02,
+):
+    backbone = bert_util.BertBackbone(
+        input_ids_blob=input_ids_blob,
+        input_mask_blob=input_mask_blob,
+        token_type_ids_blob=token_type_ids_blob,
+        vocab_size=vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act=hidden_act,
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        initializer_range=initializer_range,
+    )
+    (lm_loss, _, _) = _AddMaskedLanguageModelLoss(
+        input_blob=backbone.sequence_output(),
+        output_weights_blob=backbone.embedding_table(),
+        positions_blob=masked_lm_positions_blob,
+        label_id_blob=masked_lm_ids_blob,
+        label_weight_blob=masked_lm_weights_blob,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        vocab_size=vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        hidden_act=bert_util.GetActivation(hidden_act),
+        initializer_range=initializer_range,
+    )
+    pooled_output = PooledOutput(
+        backbone.sequence_output(), hidden_size, initializer_range
+    )
+    (ns_loss, _, _) = _AddNextSentenceOutput(
+        input_blob=pooled_output,
+        label_blob=next_sentence_label_blob,
+        hidden_size=hidden_size,
+        initializer_range=initializer_range,
+    )
+    with flow.scope.namespace("cls-loss"):
+        total_loss = lm_loss + ns_loss
+    return total_loss
+
+
+def PooledOutput(sequence_output, hidden_size, initializer_range):
+    with flow.scope.namespace("bert-pooler"):
+        first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
+        first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
+        pooled_output = bert_util._FullyConnected(
+            first_token_tensor,
+            input_size=hidden_size,
+            units=hidden_size,
+            weight_initializer=bert_util.CreateInitializer(initializer_range),
+            name="dense",
+        )
+        pooled_output = flow.math.tanh(pooled_output)
+    return pooled_output
+
+
+def _AddMaskedLanguageModelLoss(
+    input_blob,
+    output_weights_blob,
+    positions_blob,
+    label_id_blob,
+    label_weight_blob,
+    seq_length,
+    hidden_size,
+    vocab_size,
+    max_predictions_per_seq,
+    hidden_act,
+    initializer_range,
+):
+    with flow.scope.namespace("other"):
+        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
+        ones = sum_label_weight_blob * 0.0 + 1.0
+        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
+        batch_size = flow.math.reduce_sum(ones)
+        sum_label_weight_blob = sum_label_weight_blob / batch_size
+    with flow.scope.namespace("cls-predictions"):
+        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
+        with flow.scope.namespace("transform"):
+            if callable(hidden_act):
+                act_fn = op_conf_util.kNone
+            else:
+                act_fn = hidden_act
+            input_blob = bert_util._FullyConnected(
+                input_blob,
+                input_size=hidden_size,
+                units=hidden_size,
+                activation=act_fn,
+                weight_initializer=bert_util.CreateInitializer(initializer_range),
+                name="dense",
+            )
+            if callable(hidden_act):
+                input_blob = hidden_act(input_blob)
+                input_blob = bert_util._LayerNorm(input_blob, hidden_size)
+        output_bias = flow.get_variable(
+            name="output_bias",
+            shape=[vocab_size],
+            dtype=input_blob.dtype,
+            initializer=flow.constant_initializer(1.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
+        label_id_blob = flow.reshape(label_id_blob, [-1])
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_id_blob
+        )
+        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
+        numerator = pre_example_loss * label_weight_blob
+        with flow.scope.namespace("loss"):
+            numerator = flow.math.reduce_sum(numerator, axis=[-1])
+            denominator = sum_label_weight_blob + 1e-05
+            loss = numerator / denominator
+        return (loss, pre_example_loss, logit_blob)
+
+
+def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
+    output = flow.gather(
+        params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
+    )
+    output = flow.reshape(output, [-1, hidden_size])
+    return output
+
+
+def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
+    with flow.scope.namespace("cls-seq_relationship"):
+        output_weight_blob = flow.get_variable(
+            name="output_weights",
+            shape=[2, hidden_size],
+            dtype=input_blob.dtype,
+            model_name="weight",
+            initializer=bert_util.CreateInitializer(initializer_range),
+        )
+        output_bias_blob = flow.get_variable(
+            name="output_bias",
+            shape=[2],
+            dtype=input_blob.dtype,
+            model_name="bias",
+            initializer=flow.constant_initializer(0.0),
+        )
+        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
+        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
+        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=logit_blob, labels=label_blob
+        )
+        loss = pre_example_loss
+        return (loss, pre_example_loss, logit_blob)
diff --git a/python/oneflow/test/models/resnet50.py b/python/oneflow/test/models/resnet50.py
new file mode 100644
index 0000000000000000000000000000000000000000..d567fd068e9136de244311ceb6f94471219f0586
--- /dev/null
+++ b/python/oneflow/test/models/resnet50.py
@@ -0,0 +1,332 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+import oneflow as flow
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+DATA_DIR = "/dataset/PNGS/PNG228/of_record_repeated"
+MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/resnet50/models/of_model"
+MODEL_SAVE = "./output/model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+NODE_LIST = "192.168.1.12,192.168.1.14"
+IMAGE_SIZE = 228
+BLOCK_COUNTS = [3, 4, 6, 3]
+BLOCK_FILTERS = [256, 512, 1024, 2048]
+BLOCK_FILTERS_INNER = [64, 128, 256, 512]
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = DATA_DIR
+        self.train_dir = DATA_DIR
+        self.model_save_dir = MODEL_SAVE
+        self.model_load_dir = MODEL_LOAD
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=MODEL_LOAD, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=MODEL_SAVE, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+g_output_key = []
+g_trainable = True
+
+
+def _data_load(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(
+        image, resize_x=IMAGE_SIZE, resize_y=IMAGE_SIZE, color_space="RGB"
+    )
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def _conv2d(
+    name,
+    input,
+    filters,
+    kernel_size,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilations=1,
+    weight_initializer=flow.variance_scaling_initializer(),
+):
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=(filters, input.shape[1], kernel_size, kernel_size),
+        dtype=input.dtype,
+        initializer=weight_initializer,
+        trainable=g_trainable,
+    )
+    return flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilations, name=name
+    )
+
+
+def _batch_norm(inputs, name=None):
+    return flow.layers.batch_normalization(
+        inputs=inputs,
+        axis=1,
+        momentum=0.997,
+        epsilon=1e-05,
+        center=True,
+        scale=True,
+        trainable=g_trainable,
+        name=name,
+    )
+
+
+def conv2d_affine(
+    input, name, filters, kernel_size, strides, activation=op_conf_util.kNone
+):
+    padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID"
+    output = _conv2d(name, input, filters, kernel_size, strides, padding)
+    return output
+
+
+def bottleneck_transformation(input, block_name, filters, filters_inner, strides):
+    a = conv2d_affine(
+        input,
+        block_name + "_branch2a",
+        filters_inner,
+        1,
+        1,
+        activation=op_conf_util.kRelu,
+    )
+    b = conv2d_affine(
+        a,
+        block_name + "_branch2b",
+        filters_inner,
+        1,
+        strides,
+        activation=op_conf_util.kRelu,
+    )
+    c = conv2d_affine(b, block_name + "_branch2c", filters, 1, 1)
+    return c
+
+
+def residual_block(input, block_name, filters, filters_inner, strides_init):
+    if strides_init != 1 or block_name == "res2_0":
+        shortcut = conv2d_affine(
+            input, block_name + "_branch1", filters, 1, strides_init
+        )
+    else:
+        shortcut = input
+    bottleneck = bottleneck_transformation(
+        input, block_name, filters, filters_inner, strides_init
+    )
+    return flow.math.relu(shortcut + bottleneck)
+
+
+def residual_stage(input, stage_name, counts, filters, filters_inner, stride_init=2):
+    output = input
+    for i in range(counts):
+        block_name = "%s_%d" % (stage_name, i)
+        output = residual_block(
+            output, block_name, filters, filters_inner, stride_init if i == 0 else 1
+        )
+    return output
+
+
+def resnet_conv_x_body(input, on_stage_end=lambda x: x):
+    output = input
+    for (i, (counts, filters, filters_inner)) in enumerate(
+        zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER)
+    ):
+        stage_name = "res%d" % (i + 2)
+        output = residual_stage(
+            output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2
+        )
+        on_stage_end(output)
+        g_output_key.append(stage_name)
+    return output
+
+
+def resnet_stem(input):
+    conv1 = _conv2d("conv1", input, 64, 7, 2)
+    g_output_key.append("conv1")
+    conv1_bn = conv1
+    pool1 = flow.nn.avg_pool2d(
+        conv1_bn, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
+    )
+    g_output_key.append("pool1")
+    return pool1
+
+
+def resnet50(args, data_dir):
+    (labels, images) = _data_load(args, data_dir)
+    g_output_key.append("input_img")
+    with flow.scope.namespace("Resnet"):
+        stem = resnet_stem(images)
+        body = resnet_conv_x_body(stem, lambda x: x)
+        pool5 = flow.nn.avg_pool2d(
+            body, ksize=7, strides=1, padding="VALID", data_format="NCHW", name="pool5"
+        )
+        g_output_key.append("pool5")
+        fc1001 = flow.layers.dense(
+            flow.reshape(pool5, (pool5.shape[0], -1)),
+            units=1001,
+            use_bias=True,
+            kernel_initializer=flow.xavier_uniform_initializer(),
+            bias_initializer=flow.zeros_initializer(),
+            trainable=g_trainable,
+            name="fc1001",
+        )
+        g_output_key.append("fc1001")
+        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            labels, fc1001, name="softmax_loss"
+        )
+        g_output_key.append("cross_entropy")
+    return loss
+
+
+def _set_trainable(trainable):
+    global g_trainable
+    g_trainable = trainable
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    train_config = flow.FunctionConfig()
+    train_config.default_logical_view(flow.scope.consistent_view())
+    train_config.default_data_type(flow.float)
+    train_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=train_config)
+    def TrainNet():
+        _set_trainable(True)
+        loss = resnet50(args, args.train_dir)
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [0.0032]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    eval_config = flow.FunctionConfig()
+    eval_config.default_data_type(flow.float)
+    eval_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(function_config=eval_config)
+    def evaluate():
+        with flow.scope.consistent_view():
+            _set_trainable(False)
+            return resnet50(args, args.eval_dir)
+
+    check_point = flow.train.CheckPoint()
+    check_point.load(MODEL_LOAD)
+    loss = []
+    fmt_str = "{:>12}  {:>12}  {:.6f}"
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    for i in range(args.iter_num):
+        train_loss = TrainNet().get().mean()
+        loss.append(train_loss)
+        print(fmt_str.format(i, "train loss:", train_loss))
+    loss_file = "{}n{}c.npy".format(
+        str(args.num_nodes), str(args.gpu_num_per_node * args.num_nodes)
+    )
+    loss_path = "./of_loss/resnet50"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    flow.env.log_dir("./output/log")
+    flow.env.ctrl_port(12138)
+    args = parser.parse_args()
+    if args.multinode:
+        flow.env.ctrl_port(12139)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.scp_binary_without_uuid:
+            flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+        elif args.skip_scp_binary:
+            flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+        else:
+            flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    num_nodes = len(args.node_list.strip().split(",")) if args.multinode else 1
+    print(
+        "Traning resnet50: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/test/models/run_cnns_test.py b/python/oneflow/test/models/run_cnns_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1add9a5fb9100d09724b0402e94f39d66c48b488
--- /dev/null
+++ b/python/oneflow/test/models/run_cnns_test.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+
+import cnns_tests
+import env_2node
+import numpy
+from absl import app
+from absl.testing import absltest
+from test_1node_mixin import Test1NodeMixin
+from test_2node_mixin import Test2NodeMixin
+
+
+class TestNodeMixin(Test1NodeMixin, Test2NodeMixin):
+    pass
+
+
+class TestAlexNet(TestNodeMixin, cnns_tests.TestAlexNetMixin, absltest.TestCase):
+    pass
+
+
+class TestResNet50(TestNodeMixin, cnns_tests.TestResNet50Mixin, absltest.TestCase):
+    pass
+
+
+class TestVgg16(TestNodeMixin, cnns_tests.TestVgg16Mixin, absltest.TestCase):
+    pass
+
+
+class TestInceptionV3(
+    TestNodeMixin, cnns_tests.TestInceptionV3Mixin, absltest.TestCase
+):
+    pass
+
+
+def main(argv):
+    env_2node.Init()
+    absltest.main()
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/python/oneflow/test/models/test_1node_mixin.py b/python/oneflow/test/models/test_1node_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e45cbad4faa0910faed6119f4530faf01a37820
--- /dev/null
+++ b/python/oneflow/test/models/test_1node_mixin.py
@@ -0,0 +1,27 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow.unittest
+
+
+class Test1NodeMixin:
+    def test_1n1c(self):
+        self.run_net(1)
+        self.print_and_check_result("1n1c")
+
+    def test_1n4c(self):
+        self.run_net(4)
+        self.print_and_check_result("1n4c")
diff --git a/python/oneflow/test/models/test_2node_mixin.py b/python/oneflow/test/models/test_2node_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab32b3593579f72dfa3cbde6eca23899d90c2308
--- /dev/null
+++ b/python/oneflow/test/models/test_2node_mixin.py
@@ -0,0 +1,27 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from absl import flags
+
+import oneflow.unittest
+
+FLAGS = flags.FLAGS
+
+
+class Test2NodeMixin:
+    def test_2n8c(self):
+        self.run_net(4, 2, FLAGS.nodes_list)
+        self.print_and_check_result("2n8c")
diff --git a/python/oneflow/test/models/test_alexnet_model.py b/python/oneflow/test/models/test_alexnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e80c74bec29ce83876f4a5da2dfa6a454cfdf637
--- /dev/null
+++ b/python/oneflow/test/models/test_alexnet_model.py
@@ -0,0 +1,255 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+from datetime import datetime
+
+import oneflow as flow
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+import oneflow.unittest
+
+_DATA_DIR = "/dataset/PNGS/PNG227/of_record_repeated"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/alexnet/models/of_model_bk"
+
+
+class DLNetSpec(object):
+    def __init__(self):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD
+        self.num_nodes = 1
+        self.node_list = None
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+
+
+global_specs = DLNetSpec()
+
+
+class TrainData(flow.model.DataModule):
+    def __init__(self, specs):
+        super().__init__()
+        self.specs = specs
+
+    def forward(self, *args):
+        return _data_load_layer(self.specs, self.specs.train_dir)
+
+
+class ValData(flow.model.DataModule):
+    def __init__(self, specs):
+        super().__init__()
+        self.specs = specs
+
+    def forward(self, *args):
+        return _data_load_layer(self.specs, self.specs.eval_dir)
+
+
+class AlexNet(flow.model.Model):
+    def __init__(self, specs, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.specs = specs
+
+    def forward(self, images, trainable=False):
+        conv1 = _conv2d_layer(
+            "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID"
+        )
+        pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
+        conv2 = _conv2d_layer("conv2", pool1, filters=192, kernel_size=5)
+        pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2")
+        conv3 = _conv2d_layer("conv3", pool2, filters=384)
+        conv4 = _conv2d_layer("conv4", conv3, filters=384)
+        conv5 = _conv2d_layer("conv5", conv4, filters=256)
+        pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5")
+
+        def _get_initializer():
+            kernel_initializer = initializer_conf_util.InitializerConf()
+            kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+            return kernel_initializer
+
+        if len(pool5.shape) > 2:
+            pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1))
+        fc1 = flow.layers.dense(
+            inputs=pool5,
+            units=4096,
+            activation=flow.math.relu,
+            use_bias=False,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=False,
+            trainable=trainable,
+            name="fc1",
+        )
+        dropout1 = fc1
+        fc2 = flow.layers.dense(
+            inputs=dropout1,
+            units=4096,
+            activation=flow.math.relu,
+            use_bias=False,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=False,
+            trainable=trainable,
+            name="fc2",
+        )
+        dropout2 = fc2
+        fc3 = flow.layers.dense(
+            inputs=dropout2,
+            units=1001,
+            activation=None,
+            use_bias=False,
+            kernel_initializer=_get_initializer(),
+            bias_initializer=False,
+            trainable=trainable,
+            name="fc3",
+        )
+        return fc3
+
+    def training_step(self, batch, optimizer_idx):
+        assert optimizer_idx == 0
+        (images, labels) = batch
+        fc3 = self(images, True)
+        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            labels, fc3, name="softmax_loss"
+        )
+        return loss
+
+    def validation_step(self, batch):
+        (images, labels) = batch
+        fc3 = self(images, False)
+        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+            labels, fc3, name="softmax_loss"
+        )
+        return loss
+
+    def configure_optimizers(self):
+        return flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        )
+
+
+class LossMoniter(flow.model.Callback):
+    def on_training_step_end(self, step_idx, outputs, optimizer_idx):
+        assert optimizer_idx == 0
+        loss = outputs.mean()
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(step_idx, "train loss:", loss))
+
+    def on_validation_step_end(self, step_idx, outputs):
+        loss = outputs.mean()
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(step_idx, "validation loss:", loss))
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    flow.env.ctrl_port(9788)
+    flow.config.machine_num(global_specs.num_nodes)
+    flow.config.gpu_device_num(global_specs.gpu_num_per_node)
+    train_exe_config = flow.ExecutionConfig()
+    train_exe_config.default_logical_view(flow.scope.consistent_view())
+    train_exe_config.default_data_type(flow.float)
+    train_config = flow.model.TrainingConfig()
+    train_config.config_execution(train_exe_config)
+    train_config.config_data(TrainData(global_specs))
+    val_exe_config = flow.ExecutionConfig()
+    val_exe_config.default_logical_view(flow.scope.consistent_view())
+    val_exe_config.default_data_type(flow.float)
+    val_config = flow.model.ValidationConfig()
+    val_config.config_execution(val_exe_config)
+    val_config.config_data(ValData(global_specs))
+    val_config.config_step_interval(10)
+    ck_config = flow.model.CheckpointConfig()
+    ck_config.config_load(dirpath=global_specs.model_load_dir)
+    ck_config.config_save(dirpath=global_specs.model_save_dir, step_interval=10)
+    loss_monitor_cb = LossMoniter()
+    alexnet_md = AlexNet(global_specs, is_deprecated_function_style=True)
+    alexnet_md.fit(
+        training_config=train_config,
+        validation_config=val_config,
+        checkpoint_config=ck_config,
+        callbacks=[loss_monitor_cb],
+        max_steps=20,
+    )
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="SAME",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=False,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.random_uniform_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, data_format)
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.nn.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    (image, label) = flow.data.ofrecord_image_classification_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        image_feature_name="encoded",
+        label_feature_name="class/label",
+        color_space="RGB",
+        name="decode",
+    )
+    rsz = flow.image.resize(image, target_size=[227, 227], color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (normal, label)
diff --git a/python/oneflow/test/models/test_bert.py b/python/oneflow/test/models/test_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f57749aec6fc1fddb3747f8071486f71cb2324
--- /dev/null
+++ b/python/oneflow/test/models/test_bert.py
@@ -0,0 +1,277 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import copy
+import os
+import sys
+import unittest
+
+import numpy as np
+from absl import flags
+from pretrain import PreTrain
+
+import oneflow as flow
+import oneflow.unittest
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/dataset/bert/bert_seq_len_128_repeat1024", "")
+flags.DEFINE_string(
+    "model_load_dir", "/dataset/bert_regression_test/of_random_init_L-12_H-768_A-12", ""
+)
+flags.DEFINE_string("model_save_dir", "snapshots", "")
+flags.DEFINE_float("lr", 0.0001, "learning rate")
+flags.DEFINE_float("weight_decay_rate", 0.01, "")
+flags.DEFINE_integer("batch_size", 24, "")
+flags.DEFINE_integer("data_part_num", 8, "")
+flags.DEFINE_integer("seq_length", 128, "")
+flags.DEFINE_integer("max_predictions_per_seq", 20, "")
+flags.DEFINE_integer("num_hidden_layers", 12, "")
+flags.DEFINE_integer("num_attention_heads", 12, "")
+flags.DEFINE_integer("max_position_embeddings", 512, "")
+flags.DEFINE_integer("type_vocab_size", 2, "")
+flags.DEFINE_integer("vocab_size", 30522, "")
+flags.DEFINE_float("attention_probs_dropout_prob", 0.0, "")
+flags.DEFINE_float("hidden_dropout_prob", 0.0, "")
+flags.DEFINE_integer("hidden_size_per_head", 64, "")
+FLAGS(sys.argv)
+
+
+def _blob_conf(name, shape, dtype=flow.int32):
+    return flow.data.BlobConf(
+        name=name, shape=shape, dtype=dtype, codec=flow.data.RawCodec()
+    )
+
+
+def BertDecoder(
+    data_dir, batch_size=1, data_part_num=1, seq_length=128, max_predictions_per_seq=20
+):
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir, batch_size=batch_size, data_part_num=data_part_num, name="decode"
+    )
+    input_ids = flow.data.ofrecord_raw_decoder(
+        ofrecord, "input_ids", shape=(seq_length,), dtype=flow.int32
+    )
+    next_sentence_labels = flow.data.ofrecord_raw_decoder(
+        ofrecord, "next_sentence_labels", shape=(1,), dtype=flow.int32
+    )
+    input_mask = flow.data.ofrecord_raw_decoder(
+        ofrecord, "input_mask", shape=(seq_length,), dtype=flow.int32
+    )
+    segment_ids = flow.data.ofrecord_raw_decoder(
+        ofrecord, "segment_ids", shape=(seq_length,), dtype=flow.int32
+    )
+    masked_lm_ids = flow.data.ofrecord_raw_decoder(
+        ofrecord, "masked_lm_ids", shape=(max_predictions_per_seq,), dtype=flow.int32
+    )
+    masked_lm_positions = flow.data.ofrecord_raw_decoder(
+        ofrecord,
+        "masked_lm_positions",
+        shape=(max_predictions_per_seq,),
+        dtype=flow.int32,
+    )
+    masked_lm_weights = flow.data.ofrecord_raw_decoder(
+        ofrecord,
+        "masked_lm_weights",
+        shape=(max_predictions_per_seq,),
+        dtype=flow.float,
+    )
+    return (
+        input_ids,
+        next_sentence_labels,
+        input_mask,
+        segment_ids,
+        masked_lm_ids,
+        masked_lm_positions,
+        masked_lm_weights,
+    )
+
+
+def BuildPreTrainNet(
+    batch_size,
+    data_part_num,
+    seq_length=128,
+    max_position_embeddings=512,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    vocab_size=30522,
+    type_vocab_size=2,
+    max_predictions_per_seq=20,
+):
+    hidden_size = 64 * num_attention_heads
+    intermediate_size = hidden_size * 4
+    if data_part_num == 1:
+        with flow.scope.placement("cpu", "0:0"):
+            decoders = BertDecoder(
+                FLAGS.data_dir,
+                batch_size,
+                data_part_num,
+                seq_length,
+                max_predictions_per_seq,
+            )
+    else:
+        assert data_part_num > 1
+        decoders = BertDecoder(
+            FLAGS.data_dir,
+            batch_size,
+            data_part_num,
+            seq_length,
+            max_predictions_per_seq,
+        )
+    input_ids = decoders[0]
+    next_sentence_labels = decoders[1]
+    input_mask = decoders[2]
+    token_type_ids = decoders[3]
+    masked_lm_ids = decoders[4]
+    masked_lm_positions = decoders[5]
+    masked_lm_weights = decoders[6]
+    return PreTrain(
+        input_ids,
+        input_mask,
+        token_type_ids,
+        masked_lm_positions,
+        masked_lm_ids,
+        masked_lm_weights,
+        next_sentence_labels,
+        vocab_size,
+        seq_length=seq_length,
+        hidden_size=hidden_size,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        intermediate_size=intermediate_size,
+        hidden_act="gelu",
+        hidden_dropout_prob=hidden_dropout_prob,
+        attention_probs_dropout_prob=attention_probs_dropout_prob,
+        max_position_embeddings=max_position_embeddings,
+        type_vocab_size=type_vocab_size,
+        max_predictions_per_seq=max_predictions_per_seq,
+        initializer_range=0.02,
+    )
+
+
+def CreateOptimizer():
+    lr_warmup = flow.optimizer.warmup.linear(1000, 0)
+    lr_scheduler = flow.optimizer.PolynomialScheduler(
+        FLAGS.lr, 100000, 0.0, warmup=lr_warmup
+    )
+    return flow.optimizer.AdamW(
+        lr_scheduler,
+        epsilon=1e-06,
+        weight_decay=FLAGS.weight_decay_rate,
+        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
+        grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0),
+    )
+
+
+def PretrainJob():
+    total_loss = BuildPreTrainNet(
+        batch_size=FLAGS.batch_size,
+        data_part_num=FLAGS.data_part_num,
+        seq_length=FLAGS.seq_length,
+        max_position_embeddings=FLAGS.max_position_embeddings,
+        num_hidden_layers=FLAGS.num_hidden_layers,
+        num_attention_heads=FLAGS.num_attention_heads,
+        hidden_dropout_prob=FLAGS.hidden_dropout_prob,
+        attention_probs_dropout_prob=FLAGS.attention_probs_dropout_prob,
+        vocab_size=FLAGS.vocab_size,
+        type_vocab_size=FLAGS.type_vocab_size,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+    )
+    opt = CreateOptimizer()
+    opt.minimize(total_loss)
+    return total_loss
+
+
+func_config = flow.FunctionConfig()
+func_config.default_logical_view(flow.scope.consistent_view())
+func_config.enable_auto_mixed_precision(FLAGS.enable_auto_mixed_precision)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    flow.config.enable_debug_mode(True)
+    flow.config.gpu_device_num(1)
+    pretrain_job = flow.global_function(type="train", function_config=func_config)(
+        PretrainJob
+    )
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    of_loss = [pretrain_job().get().mean() for _ in range(10)]
+    print(of_loss)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n4c(test_case):
+    flow.config.gpu_device_num(4)
+    pretrain_job = flow.global_function(type="train", function_config=func_config)(
+        PretrainJob
+    )
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    of_loss = [pretrain_job().get().mean() for _ in range(10)]
+    print(of_loss)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+@flow.unittest.num_nodes_required(2)
+def test_2n8c(test_case):
+    flow.config.gpu_device_num(4)
+    pretrain_job = flow.global_function(type="train", function_config=func_config)(
+        PretrainJob
+    )
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    of_loss = [pretrain_job().get().mean() for _ in range(10)]
+    print(of_loss)
+
+
+def test_inplace(test_case):
+    test_case.assertTrue(
+        np.allclose(GetSeveralLossesAsNumpy(True), GetSeveralLossesAsNumpy(False))
+    )
+
+
+def GetSeveralLossesAsNumpy(enable_inplace, num_iters=10):
+    flow.config.enable_debug_mode(True)
+    flow.config.gpu_device_num(1)
+    train_config = flow.FunctionConfig()
+    train_config.default_logical_view(flow.scope.consistent_view())
+    train_config.enable_inplace(enable_inplace)
+
+    @flow.global_function(type="train", function_config=train_config)
+    def PretrainJob():
+        loss = BuildPreTrainNet(
+            batch_size=FLAGS.batch_size,
+            data_part_num=FLAGS.data_part_num,
+            seq_length=FLAGS.seq_length,
+            max_position_embeddings=FLAGS.max_position_embeddings,
+            num_hidden_layers=1,
+            num_attention_heads=FLAGS.num_attention_heads,
+            hidden_dropout_prob=FLAGS.hidden_dropout_prob,
+            attention_probs_dropout_prob=FLAGS.attention_probs_dropout_prob,
+            vocab_size=FLAGS.vocab_size,
+            type_vocab_size=FLAGS.type_vocab_size,
+            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        )
+        CreateOptimizer().minimize(loss)
+        return loss
+
+    check_point = flow.train.CheckPoint()
+    check_point.load(FLAGS.model_load_dir)
+    ret = [PretrainJob().get().mean() for _ in range(num_iters)]
+    flow.clear_default_session()
+    return np.array(ret)
diff --git a/python/oneflow/test/models/test_dcgan.py b/python/oneflow/test/models/test_dcgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e01dfbdb1074ac2d60b9f3d3d6854c55fbe09b8
--- /dev/null
+++ b/python/oneflow/test/models/test_dcgan.py
@@ -0,0 +1,326 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.typing as oft
+import oneflow.unittest
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    dcgan = DCGAN()
+    dcgan.compare_with_tf(1)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n4c(test_case):
+    dcgan = DCGAN()
+    dcgan.compare_with_tf(4)
+
+
+class DCGAN:
+    def __init__(self):
+        self.lr = 0.0001
+        self.z_dim = 100
+        self.batch_size = 32
+
+    def compare_with_tf(self, gpu_num, result_dir="/dataset/gan_test/dcgan/"):
+        flow.config.gpu_device_num(gpu_num)
+        func_config = flow.FunctionConfig()
+        func_config.default_data_type(flow.float)
+        func_config.default_logical_view(flow.scope.consistent_view())
+
+        @flow.global_function(type="train", function_config=func_config)
+        def test_generator(
+            z: oft.Numpy.Placeholder((self.batch_size, self.z_dim)),
+            label1: oft.Numpy.Placeholder((self.batch_size, 1)),
+        ):
+            g_out = self.generator(z, trainable=True, const_init=True)
+            g_logits = self.discriminator(g_out, trainable=False, const_init=True)
+            g_loss = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(g_logits),
+                g_logits,
+                name="Gloss_sigmoid_cross_entropy_with_logits",
+            )
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+            ).minimize(g_loss)
+            return g_loss
+
+        @flow.global_function(type="train", function_config=func_config)
+        def test_discriminator(
+            z: oft.Numpy.Placeholder((self.batch_size, 100)),
+            images: oft.Numpy.Placeholder((self.batch_size, 1, 28, 28)),
+            label1: oft.Numpy.Placeholder((self.batch_size, 1)),
+            label0: oft.Numpy.Placeholder((self.batch_size, 1)),
+        ):
+            g_out = self.generator(z, trainable=False, const_init=True)
+            g_logits = self.discriminator(g_out, trainable=True, const_init=True)
+            d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.zeros_like(g_logits),
+                g_logits,
+                name="Dloss_fake_sigmoid_cross_entropy_with_logits",
+            )
+            d_logits = self.discriminator(
+                images, trainable=True, reuse=True, const_init=True
+            )
+            d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(d_logits),
+                d_logits,
+                name="Dloss_real_sigmoid_cross_entropy_with_logits",
+            )
+            d_loss = d_loss_fake + d_loss_real
+            flow.optimizer.SGD(
+                flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+            ).minimize(d_loss)
+            return d_loss
+
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        z = np.load(os.path.join(result_dir, "z.npy"))
+        imgs = np.load(os.path.join(result_dir, "img.npy")).transpose(0, 3, 1, 2)
+        label1 = np.ones((self.batch_size, 1)).astype(np.float32)
+        label0 = np.zeros((self.batch_size, 1)).astype(np.float32)
+        g_loss = test_generator(z, label1).get()
+        d_loss = test_discriminator(z, imgs, label1, label0).get()
+        tf_g_loss = np.load(os.path.join(result_dir, "g_loss.npy"))
+        tf_d_loss = np.load(os.path.join(result_dir, "d_loss.npy"))
+        if gpu_num == 1:
+            assert np.allclose(
+                g_loss.numpy(), tf_g_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(g_loss.ndarray().mean(), tf_g_loss.mean())
+            assert np.allclose(
+                d_loss.numpy(), tf_d_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(d_loss.ndarray().mean(), tf_d_loss.mean())
+
+    def generator(self, z, const_init=False, trainable=True):
+        h0 = layers.dense(
+            z, 7 * 7 * 256, name="g_fc1", const_init=const_init, trainable=trainable
+        )
+        h0 = layers.batchnorm(h0, axis=1, name="g_bn1")
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h0 = flow.reshape(h0, (-1, 256, 7, 7))
+        h1 = layers.deconv2d(
+            h0,
+            128,
+            5,
+            strides=1,
+            name="g_deconv1",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h1 = layers.batchnorm(h1, name="g_bn2")
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        h2 = layers.deconv2d(
+            h1,
+            64,
+            5,
+            strides=2,
+            name="g_deconv2",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h2 = layers.batchnorm(h2, name="g_bn3")
+        h2 = flow.nn.leaky_relu(h2, 0.3)
+        out = layers.deconv2d(
+            h2,
+            1,
+            5,
+            strides=2,
+            name="g_deconv3",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        out = flow.math.tanh(out)
+        return out
+
+    def discriminator(self, img, const_init=False, trainable=True, reuse=False):
+        h0 = layers.conv2d(
+            img,
+            64,
+            5,
+            name="d_conv1",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h1 = layers.conv2d(
+            h0,
+            128,
+            5,
+            name="d_conv2",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        out = flow.reshape(h1, (self.batch_size, -1))
+        out = layers.dense(
+            out, 1, name="d_fc", const_init=const_init, trainable=trainable, reuse=reuse
+        )
+        return out
+
+
+class layers:
+    @staticmethod
+    def deconv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=False,
+    ):
+        name_ = name if reuse == False else name + "_reuse"
+        weight_shape = (input.shape[1], filters, size, size)
+        output_shape = (
+            input.shape[0],
+            filters,
+            input.shape[2] * strides,
+            input.shape[3] * strides,
+        )
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.conv2d_transpose(
+            input,
+            weight,
+            strides=[strides, strides],
+            output_shape=output_shape,
+            padding="SAME",
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def conv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        padding="same",
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=True,
+    ):
+        name_ = name if reuse == False else name + "_reuse"
+        weight_shape = (filters, input.shape[1], size, size)
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.compat_conv2d(
+            input,
+            weight,
+            strides=[strides, strides],
+            padding=padding,
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def dense(
+        input,
+        units,
+        name,
+        use_bias=False,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+    ):
+        name_ = name if reuse == False else name + "_reuse"
+        in_shape = input.shape
+        in_num_axes = len(in_shape)
+        assert in_num_axes >= 2
+        inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input
+        weight = flow.get_variable(
+            name="{}-weight".format(name),
+            shape=(units, inputs.shape[1]),
+            dtype=inputs.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            model_name="weight",
+            reuse=reuse,
+        )
+        out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul")
+        if use_bias:
+            bias = flow.get_variable(
+                name="{}-bias".format(name),
+                shape=(units,),
+                dtype=inputs.dtype,
+                initializer=flow.random_normal_initializer()
+                if not const_init
+                else flow.constant_initializer(0.002),
+                trainable=trainable,
+                model_name="bias",
+                reuse=reuse,
+            )
+            out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")
+        out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
+        return out
+
+    @staticmethod
+    def batchnorm(input, name, axis=1, reuse=False):
+        name_ = name if reuse == False else name + "_reuse"
+        return flow.layers.batch_normalization(input, axis=axis, name=name_)
diff --git a/python/oneflow/test/models/test_dcgan_model.py b/python/oneflow/test/models/test_dcgan_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..851e0995f99d1c3aa012bfdd45bdb0d502b89292
--- /dev/null
+++ b/python/oneflow/test/models/test_dcgan_model.py
@@ -0,0 +1,358 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+class DCGAN(flow.model.Model):
+    def __init__(self, gpu_num, batch_size, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.gpu_num = gpu_num
+        self.lr = 0.0001
+        self.z_dim = 100
+        self.batch_size = batch_size
+
+    def _generator(self, z, const_init=False, trainable=True):
+        h0 = Layers.dense(
+            z, 7 * 7 * 256, name="g_fc1", const_init=const_init, trainable=trainable
+        )
+        h0 = Layers.batchnorm(h0, axis=1, name="g_bn1")
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h0 = flow.reshape(h0, (-1, 256, 7, 7))
+        h1 = Layers.deconv2d(
+            h0,
+            128,
+            5,
+            strides=1,
+            name="g_deconv1",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h1 = Layers.batchnorm(h1, name="g_bn2")
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        h2 = Layers.deconv2d(
+            h1,
+            64,
+            5,
+            strides=2,
+            name="g_deconv2",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        h2 = Layers.batchnorm(h2, name="g_bn3")
+        h2 = flow.nn.leaky_relu(h2, 0.3)
+        out = Layers.deconv2d(
+            h2,
+            1,
+            5,
+            strides=2,
+            name="g_deconv3",
+            const_init=const_init,
+            trainable=trainable,
+        )
+        out = flow.math.tanh(out)
+        return out
+
+    def _discriminator(self, img, const_init=False, trainable=True, reuse=False):
+        h0 = Layers.conv2d(
+            img,
+            64,
+            5,
+            name="d_conv1",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h0 = flow.nn.leaky_relu(h0, 0.3)
+        h1 = Layers.conv2d(
+            h0,
+            128,
+            5,
+            name="d_conv2",
+            const_init=const_init,
+            trainable=trainable,
+            reuse=reuse,
+        )
+        h1 = flow.nn.leaky_relu(h1, 0.3)
+        out = flow.reshape(h1, (self.batch_size, -1))
+        out = Layers.dense(
+            out, 1, name="d_fc", const_init=const_init, trainable=trainable, reuse=reuse
+        )
+        return out
+
+    def forward(self, batch, const_init=False, trainable=False):
+        return self._generator(batch, const_init=const_init, trainable=trainable)
+
+    def training_step(self, batch, optimizer_idx):
+        if optimizer_idx == 0:
+            (z,) = batch
+            g_out = self._generator(z, trainable=True, const_init=True)
+            g_logits = self._discriminator(g_out, trainable=False, const_init=True)
+            g_loss = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(g_logits),
+                g_logits,
+                name="Gloss_sigmoid_cross_entropy_with_logits",
+            )
+            return (g_loss, g_out)
+        elif optimizer_idx == 1:
+            (z, images) = batch
+            g_out = self._generator(z, trainable=False, const_init=True)
+            g_logits = self._discriminator(g_out, trainable=True, const_init=True)
+            d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.zeros_like(g_logits),
+                g_logits,
+                name="Dloss_fake_sigmoid_cross_entropy_with_logits",
+            )
+            d_logits = self._discriminator(
+                images, trainable=True, reuse=True, const_init=True
+            )
+            d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits(
+                flow.ones_like(d_logits),
+                d_logits,
+                name="Dloss_real_sigmoid_cross_entropy_with_logits",
+            )
+            d_loss = d_loss_fake + d_loss_real
+            return d_loss
+
+    def configure_optimizers(self):
+        generator_opt = flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+        )
+        discriminator_opt = flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
+        )
+        return [generator_opt, discriminator_opt]
+
+
+class LossMoniter(flow.model.Callback):
+    def __init__(self, result_dir):
+        self.result_dir = result_dir
+
+    def on_training_step_end(self, step_idx, outputs, optimizer_idx):
+        if optimizer_idx == 0:
+            (g_loss, g_out) = outputs
+            fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+            print(fmt_str.format(step_idx, "train g_loss:", g_loss.numpy().mean()))
+            print(fmt_str.format(step_idx, "train g_out:", g_out.numpy().mean()))
+            tf_g_loss = np.load(os.path.join(self.result_dir, "g_loss.npy"))
+            assert np.allclose(
+                g_loss.numpy(), tf_g_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(g_loss.numpy().mean(), tf_g_loss.mean())
+        elif optimizer_idx == 1:
+            d_loss = outputs
+            fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+            print(fmt_str.format(step_idx, "train d_loss:", d_loss.numpy().mean()))
+            tf_d_loss = np.load(os.path.join(self.result_dir, "d_loss.npy"))
+            assert np.allclose(
+                d_loss.numpy(), tf_d_loss, rtol=0.01, atol=0.1
+            ), "{}-{}".format(d_loss.numpy().mean(), tf_d_loss.mean())
+
+
+class NumpyTrainData(flow.model.NumpyDataModule):
+    def __init__(self, result_dir, batch_size):
+        super().__init__()
+        self.z = np.load(os.path.join(result_dir, "z.npy"))
+        self.images = np.load(os.path.join(result_dir, "img.npy")).transpose(0, 3, 1, 2)
+
+    def forward(self, step_idx, optimizer_idx):
+        if optimizer_idx == 0:
+            return (self.z,)
+        else:
+            return (self.z, self.images)
+
+
+class NumpyValData(flow.model.NumpyDataModule):
+    def __init__(self, result_dir, batch_size):
+        super().__init__()
+        self.z = np.load(os.path.join(result_dir, "z.npy"))
+
+    def forward(self, step_idx):
+        return (self.z,)
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    dcgan_compare = DCGANCompare()
+    dcgan_compare.compare_with_tf(1)
+
+
+class DCGANCompare:
+    def compare_with_tf(self, gpu_num, result_dir="/dataset/gan_test/dcgan/"):
+        batch_size = 32
+        flow.config.gpu_device_num(gpu_num)
+        train_exe_config = flow.ExecutionConfig()
+        train_exe_config.default_data_type(flow.float)
+        train_exe_config.default_logical_view(flow.scope.consistent_view())
+        train_config = flow.model.TrainingConfig()
+        train_config.config_execution(train_exe_config)
+        train_config.config_data(NumpyTrainData(result_dir, batch_size))
+        loss_monitor_cb = LossMoniter(result_dir)
+        dcgan_md = DCGAN(gpu_num, batch_size, is_deprecated_function_style=True)
+        dcgan_md.fit(
+            training_config=train_config, callbacks=[loss_monitor_cb], max_steps=3
+        )
+
+
+class Layers:
+    @staticmethod
+    def deconv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=False,
+    ):
+        name_ = name if not reuse else name + "_reuse"
+        weight_shape = (input.shape[1], filters, size, size)
+        output_shape = (
+            input.shape[0],
+            filters,
+            input.shape[2] * strides,
+            input.shape[3] * strides,
+        )
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.conv2d_transpose(
+            input,
+            weight,
+            strides=[strides, strides],
+            output_shape=output_shape,
+            padding="SAME",
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def conv2d(
+        input,
+        filters,
+        size,
+        name,
+        strides=2,
+        padding="same",
+        trainable=True,
+        reuse=False,
+        const_init=False,
+        use_bias=True,
+    ):
+        name_ = name if not reuse else name + "_reuse"
+        weight_shape = (filters, input.shape[1], size, size)
+        weight = flow.get_variable(
+            name + "-weight",
+            shape=weight_shape,
+            dtype=input.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            reuse=reuse,
+        )
+        output = flow.nn.compat_conv2d(
+            input,
+            weight,
+            strides=[strides, strides],
+            padding=padding,
+            data_format="NCHW",
+            name=name_,
+        )
+        if use_bias:
+            bias = flow.get_variable(
+                name + "-bias",
+                shape=(filters,),
+                dtype=input.dtype,
+                initializer=flow.constant_initializer(0.0),
+                trainable=trainable,
+                reuse=reuse,
+            )
+            output = flow.nn.bias_add(output, bias, "NCHW")
+        return output
+
+    @staticmethod
+    def dense(
+        input,
+        units,
+        name,
+        use_bias=False,
+        trainable=True,
+        reuse=False,
+        const_init=False,
+    ):
+        name_ = name if not reuse else name + "_reuse"
+        in_shape = input.shape
+        in_num_axes = len(in_shape)
+        assert in_num_axes >= 2
+        inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input
+        weight = flow.get_variable(
+            name="{}-weight".format(name),
+            shape=(units, inputs.shape[1]),
+            dtype=inputs.dtype,
+            initializer=flow.random_normal_initializer(stddev=0.02)
+            if not const_init
+            else flow.constant_initializer(0.002),
+            trainable=trainable,
+            model_name="weight",
+            reuse=reuse,
+        )
+        out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul")
+        if use_bias:
+            bias = flow.get_variable(
+                name="{}-bias".format(name),
+                shape=(units,),
+                dtype=inputs.dtype,
+                initializer=flow.random_normal_initializer()
+                if not const_init
+                else flow.constant_initializer(0.002),
+                trainable=trainable,
+                model_name="bias",
+                reuse=reuse,
+            )
+            out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")
+        out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
+        return out
+
+    @staticmethod
+    def batchnorm(input, name, axis=1, reuse=False):
+        name_ = name if not reuse else name + "_reuse"
+        return flow.layers.batch_normalization(input, axis=axis, name=name_)
diff --git a/python/oneflow/test/models/test_dqn.py b/python/oneflow/test/models/test_dqn.py
new file mode 100644
index 0000000000000000000000000000000000000000..045469fea79aaa17e5a37898faa0b0455fa604db
--- /dev/null
+++ b/python/oneflow/test/models/test_dqn.py
@@ -0,0 +1,250 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.typing as oft
+import oneflow.unittest
+
+
+@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
+def test_1n1c(test_case):
+    dqn = DQN("gpu")
+    dqn.test_parameters_copy()
+
+
+def getQNetParams(var_name_prefix: str = "QNet", is_train: bool = True):
+    weight_init = flow.variance_scaling_initializer(
+        scale=1.0, mode="fan_in", distribution="truncated_normal", data_format="NCHW"
+    )
+    bias_init = flow.constant_initializer(value=0.0)
+    conv_prefix = "_conv1"
+    conv1_weight = flow.get_variable(
+        var_name_prefix + conv_prefix + "_weight",
+        shape=(32, 4, 3, 3),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    conv1_bias = flow.get_variable(
+        var_name_prefix + conv_prefix + "_bias",
+        shape=(32,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    conv_prefix = "_conv2"
+    conv2_weight = flow.get_variable(
+        var_name_prefix + conv_prefix + "_weight",
+        shape=(32, 32, 3, 3),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    conv2_bias = flow.get_variable(
+        var_name_prefix + conv_prefix + "_bias",
+        shape=(32,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    fc_prefix = "_fc1"
+    fc1_weight = flow.get_variable(
+        var_name_prefix + fc_prefix + "_weight",
+        shape=(512, 32 * 16 * 16),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    fc1_bias = flow.get_variable(
+        var_name_prefix + fc_prefix + "_bias",
+        shape=(512,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    fc_prefix = "_fc2"
+    fc2_weight = flow.get_variable(
+        var_name_prefix + fc_prefix + "_weight",
+        shape=(2, 512),
+        dtype=flow.float32,
+        initializer=weight_init,
+        trainable=is_train,
+    )
+    fc2_bias = flow.get_variable(
+        var_name_prefix + fc_prefix + "_bias",
+        shape=(2,),
+        dtype=flow.float32,
+        initializer=bias_init,
+        trainable=is_train,
+    )
+    return (
+        conv1_weight,
+        conv1_bias,
+        conv2_weight,
+        conv2_bias,
+        fc1_weight,
+        fc1_bias,
+        fc2_weight,
+        fc2_bias,
+    )
+
+
+BATCH_SIZE = 32
+
+
+def createOfQNet(
+    input_image: oft.Numpy.Placeholder((BATCH_SIZE, 4, 64, 64), dtype=flow.float32),
+    var_name_prefix: str = "QNet",
+    is_train: bool = True,
+) -> oft.Numpy:
+    (
+        conv1_weight,
+        conv1_bias,
+        conv2_weight,
+        conv2_bias,
+        fc1_weight,
+        fc1_bias,
+        fc2_weight,
+        fc2_bias,
+    ) = getQNetParams(var_name_prefix=var_name_prefix, is_train=is_train)
+    (
+        conv1_weight,
+        conv1_bias,
+        conv2_weight,
+        conv2_bias,
+        fc1_weight,
+        fc1_bias,
+        fc2_weight,
+        fc2_bias,
+    ) = getQNetParams(var_name_prefix=var_name_prefix, is_train=is_train)
+    conv1 = flow.nn.compat_conv2d(
+        input_image, conv1_weight, strides=[1, 1], padding="same", data_format="NCHW"
+    )
+    conv1 = flow.nn.bias_add(conv1, conv1_bias, "NCHW")
+    conv1 = flow.nn.relu(conv1)
+    pool1 = flow.nn.max_pool2d(conv1, 2, 2, "VALID", "NCHW", name="pool1")
+    conv2 = flow.nn.compat_conv2d(
+        pool1, conv2_weight, strides=[1, 1], padding="same", data_format="NCHW"
+    )
+    conv2 = flow.nn.bias_add(conv2, conv2_bias, "NCHW")
+    conv2 = flow.nn.relu(conv2)
+    pool2 = flow.nn.max_pool2d(conv2, 2, 2, "VALID", "NCHW", name="pool2")
+    pool2_flatten = flow.reshape(pool2, (BATCH_SIZE, -1))
+    fc1 = flow.matmul(a=pool2_flatten, b=fc1_weight, transpose_b=True)
+    fc1 = flow.nn.bias_add(fc1, fc1_bias)
+    fc1 = flow.nn.relu(fc1)
+    fc2 = flow.matmul(a=fc1, b=fc2_weight, transpose_b=True)
+    fc2 = flow.nn.bias_add(fc2, fc2_bias)
+    return fc2
+
+
+def get_train_config():
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    return func_config
+
+
+def get_predict_config():
+    func_config = flow.FunctionConfig()
+    func_config.default_data_type(flow.float32)
+    func_config.default_logical_view(flow.scope.consistent_view())
+    return func_config
+
+
+class DQN:
+    def __init__(self, device_tag):
+        self.device_tag_ = device_tag
+
+    def test_parameters_copy(self):
+        @flow.global_function("train", get_train_config())
+        def trainQNet(
+            input_image: oft.Numpy.Placeholder(
+                (BATCH_SIZE, 4, 64, 64), dtype=flow.float32
+            ),
+            y_input: oft.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.float32),
+            action_input: oft.Numpy.Placeholder((BATCH_SIZE, 2), dtype=flow.float32),
+        ) -> oft.Numpy:
+            with flow.scope.placement(self.device_tag_, "0:0-0"):
+                out = createOfQNet(input_image, var_name_prefix="QNet", is_train=True)
+                Q_Action = flow.math.reduce_sum(out * action_input, axis=1)
+                cost = flow.math.reduce_mean(flow.math.square(y_input - Q_Action))
+                learning_rate = 0.0002
+                flow.optimizer.SGD(
+                    flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
+                    momentum=0,
+                ).minimize(cost)
+            return out
+
+        @flow.global_function("predict", get_predict_config())
+        def predictQNet(
+            input_image: oft.Numpy.Placeholder(
+                (BATCH_SIZE, 4, 64, 64), dtype=flow.float32
+            )
+        ) -> oft.Numpy:
+            with flow.scope.placement(self.device_tag_, "0:0-0"):
+                out = createOfQNet(input_image, var_name_prefix="QNetT", is_train=False)
+                return out
+
+        @flow.global_function("predict", get_predict_config())
+        def copyQNetToQnetT():
+            with flow.scope.placement(self.device_tag_, "0:0-0"):
+                (
+                    t_conv1_weight,
+                    t_conv1_bias,
+                    t_conv2_weight,
+                    t_conv2_bias,
+                    t_fc1_weight,
+                    t_fc1_bias,
+                    t_fc2_weight,
+                    t_fc2_bias,
+                ) = getQNetParams(var_name_prefix="QNet", is_train=True)
+                (
+                    p_conv1_weight,
+                    p_conv1_bias,
+                    p_conv2_weight,
+                    p_conv2_bias,
+                    p_fc1_weight,
+                    p_fc1_bias,
+                    p_fc2_weight,
+                    p_fc2_bias,
+                ) = getQNetParams(var_name_prefix="QNetT", is_train=False)
+                flow.assign(p_conv1_weight, t_conv1_weight)
+                flow.assign(p_conv1_bias, t_conv1_bias)
+                flow.assign(p_conv2_weight, t_conv2_weight)
+                flow.assign(p_conv2_bias, t_conv2_bias)
+                flow.assign(p_fc1_weight, t_fc1_weight)
+                flow.assign(p_fc1_bias, t_fc1_bias)
+                flow.assign(p_fc2_weight, t_fc2_weight)
+                flow.assign(p_fc2_bias, t_fc2_bias)
+
+        check_point = flow.train.CheckPoint()
+        check_point.init()
+        input_image = np.ones((BATCH_SIZE, 4, 64, 64)).astype(np.float32)
+        y_input = np.random.random_sample((BATCH_SIZE,)).astype(np.float32)
+        action_input = np.random.random_sample((BATCH_SIZE, 2)).astype(np.float32)
+        train_out = trainQNet(input_image, y_input, action_input)
+        copyQNetToQnetT()
+        train_out = trainQNet(input_image, y_input, action_input)
+        predict_out = predictQNet(input_image)
+        assert np.allclose(train_out, predict_out, rtol=0.01, atol=0.1), "{}-{}".format(
+            train_out.mean(), predict_out.mean()
+        )
diff --git a/python/oneflow/test/models/vgg16.py b/python/oneflow/test/models/vgg16.py
new file mode 100644
index 0000000000000000000000000000000000000000..973181654cdee9566c20a8e08066e6302efbb865
--- /dev/null
+++ b/python/oneflow/test/models/vgg16.py
@@ -0,0 +1,303 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+import os
+from datetime import datetime
+
+import numpy
+
+import oneflow as flow
+import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util
+import oneflow.core.operator.op_conf_pb2 as op_conf_util
+
+_DATA_DIR = "/dataset/PNGS/PNG224/of_record_repeated"
+_SINGLE_DATA_DIR = "/dataset/PNGS/PNG224/of_record"
+_MODEL_LOAD_DIR = "/dataset/PNGS/cnns_model_for_test/vgg16/models/of_model"
+_MODEL_SAVE_DIR = "./model_save-{}".format(
+    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
+)
+NODE_LIST = "192.168.1.12,192.168.1.14"
+
+
+class DLNetSpec(object):
+    def __init__(self, enable_auto_mixed_precision):
+        self.batch_size = 8
+        self.data_part_num = 32
+        self.eval_dir = _DATA_DIR
+        self.train_dir = _DATA_DIR
+        self.model_save_dir = _MODEL_SAVE_DIR
+        self.model_load_dir = _MODEL_LOAD_DIR
+        self.num_nodes = 1
+        self.gpu_num_per_node = 1
+        self.iter_num = 10
+        self.enable_auto_mixed_precision = enable_auto_mixed_precision
+
+
+parser = argparse.ArgumentParser(description="flags for multi-node and resource")
+parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False)
+parser.add_argument("-i", "--iter_num", type=int, default=10, required=False)
+parser.add_argument(
+    "-m", "--multinode", default=False, action="store_true", required=False
+)
+parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False)
+parser.add_argument(
+    "-s", "--skip_scp_binary", default=False, action="store_true", required=False
+)
+parser.add_argument(
+    "-c",
+    "--scp_binary_without_uuid",
+    default=False,
+    action="store_true",
+    required=False,
+)
+parser.add_argument(
+    "-r", "--remote_by_hand", default=False, action="store_true", required=False
+)
+parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False)
+parser.add_argument(
+    "-load", "--model_load_dir", type=str, default=_MODEL_LOAD_DIR, required=False
+)
+parser.add_argument(
+    "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False
+)
+parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False)
+parser.add_argument("-b", "--batch_size", type=int, default=8, required=False)
+
+
+def _conv2d_layer(
+    name,
+    input,
+    filters,
+    kernel_size=3,
+    strides=1,
+    padding="VALID",
+    data_format="NCHW",
+    dilation_rate=1,
+    activation=op_conf_util.kRelu,
+    use_bias=True,
+    weight_initializer=flow.random_uniform_initializer(),
+    bias_initializer=flow.constant_initializer(),
+):
+    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
+    weight = flow.get_variable(
+        name + "-weight",
+        shape=weight_shape,
+        dtype=input.dtype,
+        initializer=weight_initializer,
+    )
+    output = flow.nn.conv2d(
+        input, weight, strides, padding, None, data_format, dilation_rate, name=name
+    )
+    if use_bias:
+        bias = flow.get_variable(
+            name + "-bias",
+            shape=(filters,),
+            dtype=input.dtype,
+            initializer=bias_initializer,
+        )
+        output = flow.nn.bias_add(output, bias, "NCHW")
+    if activation is not None:
+        if activation == op_conf_util.kRelu:
+            output = flow.math.relu(output)
+        else:
+            raise NotImplementedError
+    return output
+
+
+def _data_load_layer(args, data_dir):
+    node_num = args.num_nodes
+    total_batch_size = args.batch_size * args.gpu_num_per_node * node_num
+    rgb_mean = [123.68, 116.78, 103.94]
+    ofrecord = flow.data.ofrecord_reader(
+        data_dir,
+        batch_size=total_batch_size,
+        data_part_num=args.data_part_num,
+        name="decode",
+    )
+    image = flow.data.ofrecord_image_decoder(ofrecord, "encoded", color_space="RGB")
+    label = flow.data.ofrecord_raw_decoder(
+        ofrecord, "class/label", shape=(), dtype=flow.int32
+    )
+    rsz = flow.image.resize(image, resize_x=224, resize_y=224, color_space="RGB")
+    normal = flow.image.crop_mirror_normalize(
+        rsz,
+        color_space="RGB",
+        output_layout="NCHW",
+        mean=rgb_mean,
+        output_dtype=flow.float,
+    )
+    return (label, normal)
+
+
+def _conv_block(in_blob, index, filters, conv_times):
+    conv_block = []
+    conv_block.insert(0, in_blob)
+    for i in range(conv_times):
+        conv_i = _conv2d_layer(
+            name="conv{}".format(index),
+            input=conv_block[i],
+            filters=filters,
+            kernel_size=3,
+            strides=1,
+        )
+        conv_block.append(conv_i)
+        index += 1
+    return conv_block
+
+
+def vgg(images, labels, trainable=True):
+    to_return = []
+    conv1 = _conv_block(images, 0, 64, 2)
+    pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1")
+    conv2 = _conv_block(pool1, 2, 128, 2)
+    pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", "NCHW", name="pool2")
+    conv3 = _conv_block(pool2, 4, 256, 3)
+    pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", "NCHW", name="pool3")
+    conv4 = _conv_block(pool3, 7, 512, 3)
+    pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", "NCHW", name="pool4")
+    conv5 = _conv_block(pool4, 10, 512, 3)
+    pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", "NCHW", name="pool5")
+
+    def _get_kernel_initializer():
+        kernel_initializer = initializer_conf_util.InitializerConf()
+        kernel_initializer.truncated_normal_conf.std = 0.816496580927726
+        return kernel_initializer
+
+    def _get_bias_initializer():
+        bias_initializer = initializer_conf_util.InitializerConf()
+        bias_initializer.constant_conf.value = 0.0
+        return bias_initializer
+
+    pool5 = flow.reshape(pool5, [-1, 512])
+    fc6 = flow.layers.dense(
+        inputs=pool5,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc1",
+    )
+    fc7 = flow.layers.dense(
+        inputs=fc6,
+        units=4096,
+        activation=flow.math.relu,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc2",
+    )
+    fc8 = flow.layers.dense(
+        inputs=fc7,
+        units=1001,
+        use_bias=True,
+        kernel_initializer=_get_kernel_initializer(),
+        bias_initializer=_get_bias_initializer(),
+        trainable=trainable,
+        name="fc_final",
+    )
+    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
+        labels, fc8, name="softmax_loss"
+    )
+    to_return.append(loss)
+    return tuple(to_return)
+
+
+def main(args):
+    flow.config.machine_num(args.num_nodes)
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    train_config = flow.FunctionConfig()
+    train_config.default_logical_view(flow.scope.consistent_view())
+    train_config.default_data_type(flow.float)
+    train_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(type="train", function_config=train_config)
+    def vgg_train_job():
+        (labels, images) = _data_load_layer(args, args.train_dir)
+        to_return = vgg(images, labels)
+        loss = to_return[-1]
+        flow.optimizer.SGD(
+            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
+        ).minimize(loss)
+        return loss
+
+    eval_config = flow.FunctionConfig()
+    eval_config.default_logical_view(flow.scope.consistent_view())
+    eval_config.default_data_type(flow.float)
+    eval_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)
+
+    @flow.global_function(function_config=eval_config)
+    def vgg_eval_job():
+        (labels, images) = _data_load_layer(args, args.eval_dir)
+        return vgg(images, labels, False)
+
+    check_point = flow.train.CheckPoint()
+    if not args.model_load_dir:
+        check_point.init()
+    else:
+        check_point.load(args.model_load_dir)
+    num_nodes = args.num_nodes
+    print(
+        "Traning vgg16: num_gpu_per_node = {}, num_nodes = {}.".format(
+            args.gpu_num_per_node, num_nodes
+        )
+    )
+    print("{:>12}  {:>12}  {:>12}".format("iter", "loss type", "loss value"))
+    loss = []
+    for i in range(args.iter_num):
+        train_loss = vgg_train_job().get().mean()
+        loss.append(train_loss)
+        fmt_str = "{:>12}  {:>12}  {:>12.6f}"
+        print(fmt_str.format(i, "train loss:", train_loss))
+        if (i + 1) % 100 == 0:
+            check_point.save(_MODEL_SAVE_DIR + str(i))
+    loss_file = "{}n{}c.npy".format(
+        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
+    )
+    loss_path = "./of_loss/vgg16"
+    if not os.path.exists(loss_path):
+        os.makedirs(loss_path)
+    numpy.save(os.path.join(loss_path, loss_file), loss)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    flow.env.log_dir("./log")
+    if args.multinode:
+        flow.env.ctrl_port(12138)
+        nodes = []
+        for n in args.node_list.strip().split(","):
+            addr_dict = {}
+            addr_dict["addr"] = n
+            nodes.append(addr_dict)
+        flow.env.machine(nodes)
+        if args.remote_by_hand is False:
+            if args.scp_binary_without_uuid:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=False)
+            elif args.skip_scp_binary:
+                flow.deprecated.init_worker(scp_binary=False, use_uuid=False)
+            else:
+                flow.deprecated.init_worker(scp_binary=True, use_uuid=True)
+    main(args)
+    if (
+        args.multinode
+        and args.skip_scp_binary is False
+        and (args.scp_binary_without_uuid is False)
+    ):
+        flow.deprecated.delete_worker()
diff --git a/python/oneflow/test/modules/automated_test_util.py b/python/oneflow/test/modules/automated_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..682939b32a9e936c16040732ecb65b7c1c5290f0
--- /dev/null
+++ b/python/oneflow/test/modules/automated_test_util.py
@@ -0,0 +1,29 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import sys
+
+test_util_parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
+oneflow_test_utils_dir_from_env = os.getenv("ONEFLOW_TEST_UTILS_DIR")
+if oneflow_test_utils_dir_from_env:
+    from pathlib import Path
+
+    oneflow_test_utils_dir_from_env = Path(oneflow_test_utils_dir_from_env)
+    test_util_parent_dir = str(oneflow_test_utils_dir_from_env.parent.absolute())
+sys.path.append(test_util_parent_dir)
+from test_utils.automated_test_util import *
diff --git a/python/oneflow/test/modules/image_test_util.py b/python/oneflow/test/modules/image_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0deb1e2e39d74eb95dee2c50a7a7a5a6e42093a
--- /dev/null
+++ b/python/oneflow/test/modules/image_test_util.py
@@ -0,0 +1,159 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import random
+
+import cv2
+import numpy as np
+import PIL
+
+import oneflow as flow
+
+global_coco_dict = dict()
+default_coco_anno_file = "/dataset/mscoco_2017/annotations/instances_val2017.json"
+default_coco_image_dir = "/dataset/mscoco_2017/val2017"
+
+
+def get_coco(anno_file):
+    global global_coco_dict
+    if anno_file not in global_coco_dict:
+        from pycocotools.coco import COCO
+
+        global_coco_dict[anno_file] = COCO(anno_file)
+    return global_coco_dict[anno_file]
+
+
+def random_sample_images_from_coco(
+    anno_file=default_coco_anno_file, image_dir=default_coco_image_dir, batch_size=2
+):
+    image_files = []
+    image_ids = []
+    batch_group_id = -1
+    coco = get_coco(anno_file)
+    img_ids = coco.getImgIds()
+    while len(image_files) < batch_size:
+        rand_img_id = random.choice(img_ids)
+        img_h = coco.imgs[rand_img_id]["height"]
+        img_w = coco.imgs[rand_img_id]["width"]
+        group_id = int(img_h / img_w)
+        if batch_group_id == -1:
+            batch_group_id = group_id
+        if group_id != batch_group_id:
+            continue
+        image_files.append(os.path.join(image_dir, coco.imgs[rand_img_id]["file_name"]))
+        image_ids.append(rand_img_id)
+    assert len(image_files) == len(image_ids)
+    return (image_files, image_ids)
+
+
+def read_images_by_cv(image_files, dtype, channels=3):
+    np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+    images = [cv2.imread(image_file).astype(np_dtype) for image_file in image_files]
+    assert all((isinstance(image, np.ndarray) for image in images))
+    assert all((image.ndim == 3 for image in images))
+    assert all((image.shape[2] == channels for image in images))
+    return images
+
+
+def read_images_by_pil(image_files, dtype, channels=3):
+    image_objs = [PIL.Image.open(image_file) for image_file in image_files]
+    images = []
+    np_dtype = flow.convert_oneflow_dtype_to_numpy_dtype(dtype)
+    for im in image_objs:
+        bands = im.getbands()
+        band = "".join(bands)
+        if band == "RGB":
+            images.append(np.asarray(im).astype(np_dtype)[:, :, ::-1])
+        elif band == "L":
+            gs_image = np.asarray(im).astype(np_dtype)
+            gs_image_shape = gs_image.shape
+            assert len(gs_image_shape) == 2
+            gs_image = gs_image.reshape(gs_image_shape + (1,))
+            gs_image = np.broadcast_to(gs_image, shape=gs_image_shape + (3,))
+            images.append(gs_image)
+        elif band == "BGR":
+            images.append(np.asarray(im).astype(np_dtype))
+        else:
+            raise NotImplementedError
+    assert all((isinstance(image, np.ndarray) for image in images))
+    assert all((image.ndim == 3 for image in images))
+    assert all((image.shape[2] == channels for image in images))
+    return images
+
+
+def infer_images_static_shape(images, channels=3):
+    image_shapes = [image.shape for image in images]
+    assert all((image.ndim == 3 for image in images))
+    assert all((image.shape[2] == channels for image in images))
+    image_shapes = np.asarray(image_shapes)
+    max_h = np.max(image_shapes[:, 0]).item()
+    max_w = np.max(image_shapes[:, 1]).item()
+    image_static_shape = (len(images), max_h, max_w, channels)
+    group_ids = []
+    aspect_ratio_list = []
+    for image_shape in image_shapes:
+        (h, w) = image_shape[0:2]
+        if h < w:
+            group_id = 0
+            aspect_ratio = h / w
+        else:
+            group_id = 1
+            aspect_ratio = w / h
+        group_ids.append(group_id)
+        aspect_ratio_list.append(aspect_ratio)
+    assert all((group_id == group_ids[0] for group_id in group_ids))
+    return (image_static_shape, aspect_ratio_list)
+
+
+def compute_keep_aspect_ratio_resized_size(
+    target_size, min_size, max_size, aspect_ratio, resize_side
+):
+    if resize_side == "shorter":
+        min_res_size = target_size
+        max_res_size = int(round(min_res_size / aspect_ratio))
+        if max_size is not None and max_res_size > max_size:
+            max_res_size = max_size
+            min_res_size = int(round(max_res_size * aspect_ratio))
+    elif resize_side == "longer":
+        max_res_size = target_size
+        min_res_size = int(round(max_res_size * aspect_ratio))
+        if min_size is not None and min_res_size < min_size:
+            min_res_size = min_size
+            max_res_size = int(round(min_res_size / aspect_ratio))
+    else:
+        raise NotImplementedError
+    return (min_res_size, max_res_size)
+
+
+def infer_keep_aspect_ratio_resized_images_static_shape(
+    target_size,
+    min_size,
+    max_size,
+    aspect_ratio_list,
+    resize_side="shorter",
+    channels=3,
+):
+    resized_size_list = []
+    for aspect_ratio in aspect_ratio_list:
+        resized_size_list.append(
+            compute_keep_aspect_ratio_resized_size(
+                target_size, min_size, max_size, aspect_ratio, resize_side
+            )
+        )
+    (res_min_size, res_max_size) = max(
+        resized_size_list, key=lambda size: size[0] * size[1]
+    )
+    return (res_min_size, res_max_size, channels)
diff --git a/python/oneflow/test/modules/resnet50_model.py b/python/oneflow/test/modules/resnet50_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9782878621e87f7926ddeb51e219491993f01bd7
--- /dev/null
+++ b/python/oneflow/test/modules/resnet50_model.py
@@ -0,0 +1,304 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Any, Callable, List, Optional, Type, Union
+
+import oneflow as flow
+import oneflow.nn as nn
+from oneflow import Tensor
+
+
+class FakeBN(nn.Module):
+    """Common base of _InstanceNorm and _BatchNorm"""
+
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-05,
+        momentum: float = 0.1,
+        affine: bool = True,
+        track_running_stats: bool = True,
+    ) -> None:
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.momentum = momentum
+        self.affine = affine
+        self.track_running_stats = track_running_stats
+        if self.affine:
+            self.weight = flow.nn.Parameter(flow.Tensor(num_features))
+            self.bias = flow.nn.Parameter(flow.Tensor(num_features))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        if self.track_running_stats:
+            self.register_buffer("running_mean", flow.Tensor(num_features))
+            self.register_buffer("running_var", flow.Tensor(num_features))
+        else:
+            self.register_parameter("running_mean", None)
+            self.register_parameter("running_var", None)
+        self._op = flow.builtin_op("identity").Input("in").Output("out").Build()
+
+    def forward(self, input):
+        return self._op(input)[0]
+
+
+def conv3x3(
+    in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1
+) -> nn.Conv2d:
+    """3x3 convolution with padding"""
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        groups=groups,
+        bias=False,
+        dilation=dilation,
+    )
+
+
+def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
+    """1x1 convolution"""
+    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion: int = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super(BasicBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        if groups != 1 or base_width != 64:
+            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
+        if dilation > 1:
+            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU()
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        if self.downsample is not None:
+            identity = self.downsample(x)
+        out += identity
+        out = self.relu(out)
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion: int = 4
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super(Bottleneck, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.0)) * groups
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = norm_layer(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = norm_layer(width)
+        self.conv3 = conv1x1(width, planes * self.expansion)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU()
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+        out = self.conv3(out)
+        out = self.bn3(out)
+        if self.downsample is not None:
+            identity = self.downsample(x)
+        out += identity
+        out = self.relu(out)
+        return out
+
+
+class ResNet(nn.Module):
+    def __init__(
+        self,
+        block: Type[Union[BasicBlock, Bottleneck]],
+        layers: List[int],
+        num_classes: int = 1000,
+        zero_init_residual: bool = False,
+        groups: int = 1,
+        width_per_group: int = 64,
+        replace_stride_with_dilation: Optional[List[bool]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super(ResNet, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        self._norm_layer = norm_layer
+        self.inplanes = 64
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError(
+                "replace_stride_with_dilation should be None or a 3-element tuple, got {}".format(
+                    replace_stride_with_dilation
+                )
+            )
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(
+            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False
+        )
+        self.bn1 = norm_layer(self.inplanes)
+        self.relu = nn.ReLU()
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(
+            block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]
+        )
+        self.layer3 = self._make_layer(
+            block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]
+        )
+        self.layer4 = self._make_layer(
+            block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]
+        )
+        self.avgpool = nn.AvgPool2d((7, 7))
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+        if zero_init_residual:
+            for m in self.modules():
+                if isinstance(m, Bottleneck):
+                    nn.init.constant_(m.bn3.weight, 0)
+                elif isinstance(m, BasicBlock):
+                    nn.init.constant_(m.bn2.weight, 0)
+
+    def _make_layer(
+        self,
+        block: Type[Union[BasicBlock, Bottleneck]],
+        planes: int,
+        blocks: int,
+        stride: int = 1,
+        dilate: bool = False,
+    ) -> nn.Sequential:
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+        layers = []
+        layers.append(
+            block(
+                self.inplanes,
+                planes,
+                stride,
+                downsample,
+                self.groups,
+                self.base_width,
+                previous_dilation,
+                norm_layer,
+            )
+        )
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    groups=self.groups,
+                    base_width=self.base_width,
+                    dilation=self.dilation,
+                    norm_layer=norm_layer,
+                )
+            )
+        return nn.Sequential(*layers)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.avgpool(x)
+        x = flow.flatten(x, 1)
+        x = self.fc(x)
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self._forward_impl(x)
+
+
+def _resnet(
+    arch: str,
+    block: Type[Union[BasicBlock, Bottleneck]],
+    layers: List[int],
+    **kwargs: Any
+) -> ResNet:
+    model = ResNet(block, layers, **kwargs)
+    return model
+
+
+def resnet50(**kwargs: Any) -> ResNet:
+    """ResNet-5
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
+    """
+    return _resnet("resnet50", Bottleneck, [3, 4, 6, 3], **kwargs)
diff --git a/python/oneflow/test/modules/test_abs.py b/python/oneflow/test/modules/test_abs.py
new file mode 100644
index 0000000000000000000000000000000000000000..43ec34ae345b5eb63094df19c4278bf40963bc9e
--- /dev/null
+++ b/python/oneflow/test/modules/test_abs.py
@@ -0,0 +1,86 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_abs_forward(test_case, device):
+    input = flow.Tensor(np.random.randn(2, 3).astype(np.float32))
+    of_out = flow.abs(input)
+    np_out = np.abs(input.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(input.abs().numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_abs_tensor_function_forward(test_case, device):
+    x = np.random.randn(2, 3).astype(np.float32)
+    input = flow.Tensor(x, dtype=flow.float32)
+    np_out = np.abs(x)
+    of_out = input.abs()
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_abs_backward(test_case, device):
+    np_input = np.random.randn(2, 3).astype(np.float32)
+    input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+    of_out = flow.abs(input).sum()
+    of_out.backward()
+    np_grad = np.where(np_input > 0, 1, -1)
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_abs_tensor_function_backward(test_case, device):
+    np_input = np.random.randn(2, 3).astype(np.float32)
+    input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+    of_out = input.abs().sum()
+    of_out.backward()
+    np_grad = np.where(np_input > 0, 1, -1)
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAbs(flow.unittest.TestCase):
+    def test_cosh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_abs_forward,
+            _test_abs_tensor_function_forward,
+            _test_abs_backward,
+            _test_abs_tensor_function_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_flow_abs_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_flow_against_pytorch(test_case, "abs", device=device)
+
+    def test_flow_tensor_abs_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_tensor_against_pytorch(test_case, "abs", device=device)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_acos.py b/python/oneflow/test/modules/test_acos.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad244a0a1329d31e066e8621d574bdcfa3784729
--- /dev/null
+++ b/python/oneflow/test/modules/test_acos.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_acos_impl(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.rand(*shape) - 0.5, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.acos(input)
+    np_out = np.arccos(input.numpy())
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = -1.0 / np.sqrt(1 - np.square(input.numpy()))
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAcos(flow.unittest.TestCase):
+    def test_acos(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_acos_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_acosh.py b/python/oneflow/test/modules/test_acosh.py
new file mode 100644
index 0000000000000000000000000000000000000000..193424c6113939610803c95e82e8bf1318cabc79
--- /dev/null
+++ b/python/oneflow/test/modules/test_acosh.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_acosh_impl(test_case, shape, device):
+    np_input = np.random.rand(*shape) + 2.0
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.acosh(of_input)
+    np_out = np.arccosh(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = 1.0 / np.sqrt(np.square(np_input) - 1)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAcosh(flow.unittest.TestCase):
+    def test_acosh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_acosh_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_activation.py b/python/oneflow/test/modules/test_activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d684591b6b1820fab5a9871da10e332ee4c244c5
--- /dev/null
+++ b/python/oneflow/test/modules/test_activation.py
@@ -0,0 +1,828 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from scipy import special
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_relu_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    m = flow.nn.ReLU()
+    of_out = m(of_input)
+    np_out = np.maximum(0, np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_out > 0, 1e-05, 1e-05))
+    inplace_m = flow.nn.ReLU(inplace=True)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_input_inplace = of_input + 1
+    inplace_m(of_input_inplace)
+    np_out = np.maximum(0, np_input + 1)
+    test_case.assertTrue(np.allclose(of_input_inplace.numpy(), np_out, 1e-05, 1e-05))
+    of_out_inplace = of_input_inplace.sum()
+    of_out_inplace.backward()
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_out > 0, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReLUModule(flow.unittest.TestCase):
+    def test_relu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_relu_impl(test_case, *arg)
+
+    @autotest
+    def test_relu_module_with_random_data(test_case):
+        m = torch.nn.ReLU()
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+def _test_relu6_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    m = flow.nn.ReLU6()
+    of_out = m(of_input)
+    np_out = np.minimum(np.maximum(0, np_input), 6.0)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(
+            of_input.grad.numpy(),
+            np.where(np_input > 6, 0, np.where(np_input < 0, 0, 1)),
+            1e-05,
+            1e-05,
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReLU6Module(flow.unittest.TestCase):
+    def test_relu6(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_relu6_impl(test_case, *arg)
+
+    @autotest
+    def test_relu6_module_with_random_data(test_case):
+        m = torch.nn.ReLU6()
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+def _test_tanh_nn_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    tanh = flow.nn.Tanh()
+    of_out = tanh(of_input)
+    np_out = np.tanh(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), 1.0 - np_out * np_out, 1e-05, 1e-05)
+    )
+
+
+def _test_tanh_function_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.tanh(of_input)
+    np_out = np.tanh(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), 1.0 - np_out * np_out, 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTanh(flow.unittest.TestCase):
+    def test_tanh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_tanh_nn_impl(test_case, *arg)
+            _test_tanh_function_impl(test_case, *arg)
+
+    @autotest
+    def test_tanh_module_with_random_data(test_case):
+        m = torch.nn.Tanh()
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+    @autotest
+    def test_flow_tanh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = flow.tanh(x)
+        return y
+
+
+def _test_elu_function_impl(test_case, shape, device):
+    m = flow.nn.ELU()
+    arr = np.random.randn(*shape)
+    np_out = np.where(arr > 0, arr, 1.0 * (np.exp(arr) - 1))
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, rtol=1e-05, atol=1e-05))
+    m = flow.nn.ELU(alpha=1.2)
+    arr = np.random.randn(*shape)
+    np_out = np.where(arr > 0, arr, 1.2 * (np.exp(arr) - 1))
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, rtol=1e-05, atol=1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.where(arr > 0, 1, 1.2 * np.exp(arr))
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestELUModule(flow.unittest.TestCase):
+    def test_elu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_elu_function_impl(test_case, *arg)
+
+    @autotest
+    def test_elu_module_with_random_data(test_case):
+        m = torch.nn.ELU(alpha=random() | nothing())
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+def _np_gelu(x):
+    return 0.5 * x * (1 + special.erf(x / np.sqrt(2)))
+
+
+def _test_gelu_impl(test_case, device):
+    np_input = np.array([1.0, -1.0, 2.3]).astype(np.float32)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    gelu = flow.nn.GELU()
+    of_out = gelu(of_input)
+    np_out = _np_gelu(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [1.0833154916763306, -0.08331547677516937, 1.0544281005859375]
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGelu(flow.unittest.TestCase):
+    def test_gelu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_gelu_impl(test_case, *arg)
+
+    def test_gelu_module_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_module_against_pytorch(test_case, "nn.GELU", device=device, n=2)
+
+
+def numpy_sigmoid(x):
+    return 1.0 / (1 + np.exp(-x))
+
+
+def numpy_sigmoid_grad(inputs, grads):
+    x = np.exp(-inputs)
+    delta = x / (1 + x) ** 2
+    return delta * grads
+
+
+def numpy_softmax(x, axis):
+    x = x - x.max(axis=axis, keepdims=True)
+    y = np.exp(x)
+    return y / y.sum(axis=axis, keepdims=True)
+
+
+def numpy_logsoftmax(x, dim):
+    e_x = np.exp(x - np.max(x, axis=dim, keepdims=True))
+    return np.log(e_x / e_x.sum(axis=dim, keepdims=True))
+
+
+def numpy_softplus(x, beta, threshold):
+    return np.where(
+        x * beta > threshold, x, 1.0 / beta * np.log(1.0 + np.exp(beta * x))
+    )
+
+
+def numpy_mish_grad(x):
+    f = 1 + np.exp(x)
+    y_grad = (f * f - 1) / (f * f + 1) + x * (4 * f * (f - 1)) / (
+        (f * f + 1) * (f * f + 1)
+    )
+    return y_grad
+
+
+def _test_sigmoid(test_case, device):
+    m = flow.nn.Sigmoid()
+    input_arr = np.random.randn(2, 3, 4, 5)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    y2 = flow.sigmoid(x)
+    y3 = x.sigmoid()
+    output = numpy_sigmoid(input_arr)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y2.numpy(), output, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y3.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_sigmoid_backward(test_case, device):
+    input_arr = np.random.randn(2, 3, 4, 5)
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    x_grad = numpy_sigmoid_grad(input_arr, np.ones(input_arr.shape))
+    m = flow.nn.Sigmoid()
+    y = m(x).sum()
+    y.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), x_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSigmoid(flow.unittest.TestCase):
+    def test_sigmoid(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [_test_sigmoid, _test_sigmoid_backward]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_sigmoid_module_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_module_against_pytorch(test_case, "nn.Sigmoid", device=device, n=2)
+
+    def test_sigmoid_flow_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_flow_against_pytorch(test_case, "sigmoid", device=device, n=2)
+
+    def test_sigmoid_tensor_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_tensor_against_pytorch(test_case, "sigmoid", device=device, n=2)
+
+
+def _test_softmax(test_case, device):
+    axis = 0
+    m = flow.nn.Softmax(dim=axis)
+    arr = np.random.randn(2, 3, 4, 5)
+    x = flow.Tensor(arr, device=flow.device(device))
+    y = m(x)
+    output = numpy_softmax(arr, axis)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_softmax_dim_1(test_case, device):
+    axis = 1
+    m = flow.nn.Softmax(dim=axis)
+    arr = np.random.randn(9, 7, 8, 16)
+    x = flow.Tensor(arr, device=flow.device(device))
+    y = m(x)
+    output = numpy_softmax(arr, axis)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_softmax_dim_2(test_case, device):
+    axis = 2
+    m = flow.nn.Softmax(dim=axis)
+    arr = np.random.randn(2, 5, 6, 3)
+    x = flow.Tensor(arr, device=flow.device(device))
+    y = m(x)
+    output = numpy_softmax(arr, axis)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_softmax_dim_3(test_case, device):
+    axis = 3
+    m = flow.nn.Softmax(dim=axis)
+    arr = np.random.randn(1, 3, 4, 7)
+    x = flow.Tensor(arr, device=flow.device(device))
+    y = m(x)
+    output = numpy_softmax(arr, axis)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+    axis2 = -1
+    m2 = flow.nn.Softmax(dim=axis)
+    y2 = m(x)
+    output2 = numpy_softmax(arr, axis)
+    test_case.assertTrue(np.allclose(y2.numpy(), output2, 1e-05, 1e-05))
+
+
+def _test_softmax_backward_normal(test_case, device):
+    x_grad = np.zeros((2, 3, 4, 5))
+    axis = 0
+    m = flow.nn.Softmax(dim=axis)
+    x = flow.Tensor(
+        np.random.randn(2, 3, 4, 5),
+        requires_grad=True,
+        device=flow.device(device),
+        dtype=flow.float64,
+    )
+    y = m(x).sum()
+    y.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), x_grad, 1e-05, 1e-05))
+
+
+def _test_softmax_backward_1_dim(test_case, device):
+    a = flow.tensor(
+        [1, 2], dtype=flow.float64, device=flow.device(device), requires_grad=True
+    )
+    b = flow.tensor(
+        [3, 4], dtype=flow.float64, device=flow.device(device), requires_grad=True
+    )
+    c = a * b
+    m = flow.nn.Softmax(dim=None)
+    d = m(c)
+    d[0].backward()
+    a_grad = np.array([0.01994417, -0.0265922267])
+    test_case.assertTrue(np.allclose(a.grad.numpy(), a_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSoftmax(flow.unittest.TestCase):
+    def test_softmax(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [
+            _test_softmax,
+            _test_softmax_dim_1,
+            _test_softmax_dim_2,
+            _test_softmax_dim_3,
+            _test_softmax_backward_normal,
+            _test_softmax_backward_1_dim,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _np_hardsigmoid_grad(x):
+    return np.where(x > 0, np.where(x >= 1, 0, 1.0 / 6), 0)
+
+
+def _test_hardsigmoid_impl(test_case, shape, device):
+    m = flow.nn.Hardsigmoid()
+    arr = np.random.randn(*shape)
+    np_out = np.maximum(0, np.minimum(1, (arr + 3) / 6))
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), _np_hardsigmoid_grad(np_out), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestHardsigmoidModule(flow.unittest.TestCase):
+    def test_hardsigmoid(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_hardsigmoid_impl(test_case, *arg)
+
+    def test_hardsigmoid_module_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_module_against_pytorch(test_case, "nn.Hardsigmoid", device=device, n=2)
+
+
+def _test_logsoftmax(test_case, device):
+    dim = 1
+    m = flow.nn.LogSoftmax(dim)
+    input_arr = np.random.randn(4, 7)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    output = numpy_logsoftmax(input_arr, dim)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_logsoftmax_dim_2(test_case, device):
+    dim = 2
+    m = flow.nn.LogSoftmax(dim)
+    input_arr = np.random.randn(3, 4, 5)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    output = numpy_logsoftmax(input_arr, dim)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_logsoftmax_dim_3(test_case, device):
+    dim = 3
+    m = flow.nn.LogSoftmax(dim)
+    input_arr = np.random.randn(8, 9, 7, 3)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    output = numpy_logsoftmax(input_arr, dim)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_logsoftmax_backward(test_case, device):
+    axis = 0
+    m = flow.nn.LogSoftmax(axis)
+    input_arr = np.array(
+        [
+            [
+                [
+                    [2.0, 1.0, 9.0, 3.0, 4.0],
+                    [1.0, 6.0, 7.0, 1.0, 4.0],
+                    [4.0, 7.0, 5.0, 8.0, 1.0],
+                    [9.0, 5.0, 7.0, 8.0, 5.0],
+                ],
+                [
+                    [1.0, 1.0, 5.0, 3.0, 5.0],
+                    [3.0, 6.0, 3.0, 7.0, 8.0],
+                    [8.0, 8.0, 1.0, 2.0, 6.0],
+                    [3.0, 5.0, 6.0, 1.0, 1.0],
+                ],
+                [
+                    [8.0, 3.0, 6.0, 3.0, 7.0],
+                    [8.0, 5.0, 1.0, 2.0, 7.0],
+                    [3.0, 9.0, 4.0, 6.0, 5.0],
+                    [5.0, 1.0, 2.0, 3.0, 6.0],
+                ],
+            ],
+            [
+                [
+                    [3.0, 5.0, 3.0, 1.0, 7.0],
+                    [5.0, 2.0, 6.0, 3.0, 5.0],
+                    [5.0, 1.0, 8.0, 6.0, 9.0],
+                    [9.0, 8.0, 4.0, 5.0, 1.0],
+                ],
+                [
+                    [7.0, 5.0, 7.0, 1.0, 6.0],
+                    [3.0, 3.0, 6.0, 6.0, 7.0],
+                    [9.0, 4.0, 1.0, 5.0, 7.0],
+                    [7.0, 6.0, 9.0, 8.0, 6.0],
+                ],
+                [
+                    [6.0, 7.0, 5.0, 3.0, 9.0],
+                    [4.0, 1.0, 2.0, 3.0, 2.0],
+                    [4.0, 3.0, 8.0, 7.0, 8.0],
+                    [1.0, 3.0, 8.0, 6.0, 2.0],
+                ],
+            ],
+        ]
+    )
+    x = flow.Tensor(
+        input_arr, requires_grad=True, device=flow.device(device), dtype=flow.float64
+    )
+    x_grad = np.array(
+        [
+            [
+                [
+                    [0.46211716, 0.96402758, -0.99505475, -0.76159416, 0.90514825],
+                    [0.96402758, -0.96402758, -0.46211716, 0.76159416, 0.46211716],
+                    [0.46211716, -0.99505475, 0.90514825, -0.76159416, 0.9993293],
+                    [0.0, 0.90514825, -0.90514825, -0.90514825, -0.96402758],
+                ],
+                [
+                    [0.99505475, 0.96402758, 0.76159416, -0.76159416, 0.46211716],
+                    [0.0, -0.90514825, 0.90514825, -0.46211716, -0.46211716],
+                    [0.46211716, -0.96402758, 0.0, 0.90514825, 0.46211716],
+                    [0.96402758, 0.46211716, 0.90514825, 0.9981779, 0.9866143],
+                ],
+                [
+                    [-0.76159416, 0.96402758, -0.46211716, 0.0, 0.76159416],
+                    [-0.96402758, -0.96402758, 0.46211716, 0.46211716, -0.9866143],
+                    [0.46211716, -0.99505475, 0.96402758, 0.46211716, 0.90514825],
+                    [-0.96402758, 0.76159416, 0.99505475, 0.90514825, -0.96402758],
+                ],
+            ],
+            [
+                [
+                    [-0.46211716, -0.96402758, 0.99505475, 0.76159416, -0.90514825],
+                    [-0.96402758, 0.96402758, 0.46211716, -0.76159416, -0.46211716],
+                    [-0.46211716, 0.99505475, -0.90514825, 0.76159416, -0.9993293],
+                    [0.0, -0.90514825, 0.90514825, 0.90514825, 0.96402758],
+                ],
+                [
+                    [-0.99505475, -0.96402758, -0.76159416, 0.76159416, -0.46211716],
+                    [0.0, 0.90514825, -0.90514825, 0.46211716, 0.46211716],
+                    [-0.46211716, 0.96402758, 0.0, -0.90514825, -0.46211716],
+                    [-0.96402758, -0.46211716, -0.90514825, -0.9981779, -0.9866143],
+                ],
+                [
+                    [0.76159416, -0.96402758, 0.46211716, 0.0, -0.76159416],
+                    [0.96402758, 0.96402758, -0.46211716, -0.46211716, 0.9866143],
+                    [-0.46211716, 0.99505475, -0.96402758, -0.46211716, -0.90514825],
+                    [0.96402758, -0.76159416, -0.99505475, -0.90514825, 0.96402758],
+                ],
+            ],
+        ]
+    )
+    y = m(x).sum()
+    y.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), x_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLogSoftmax(flow.unittest.TestCase):
+    def test_log_softmax(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [
+            _test_logsoftmax,
+            _test_logsoftmax_dim_2,
+            _test_logsoftmax_dim_3,
+            _test_logsoftmax_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_logsigmoid(test_case, device):
+    m = flow.nn.LogSigmoid()
+    arr = np.array([1.0, 2.0, 3.0, 10.2, 7.6])
+    np_out = np.log(1.0 / (1.0 + np.exp(-arr)))
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        0.2689414213699951,
+        0.11920292202211764,
+        0.04742587317756669,
+        3.716893710287265e-05,
+        0.0005002011070795276,
+    ]
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLogSigmoidModule(flow.unittest.TestCase):
+    def test_logsigmoid(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [_test_logsigmoid]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_logsigmoid_module_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_module_against_pytorch(test_case, "nn.LogSigmoid", device=device, n=2)
+
+
+def _test_softplus(test_case, device):
+    m = flow.nn.Softplus()
+    arr = np.random.randn(2, 3, 4, 5)
+    np_out = numpy_softplus(arr, 1.0, 20)
+    x = flow.Tensor(arr, device=flow.device(device))
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_softplus_beta(test_case, device):
+    m = flow.nn.Softplus(beta=1.11)
+    arr = np.random.randn(2, 3, 4, 5)
+    np_out = numpy_softplus(arr, 1.11, 20)
+    x = flow.Tensor(arr, device=flow.device(device))
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_softplus_threshold(test_case, device):
+    m = flow.nn.Softplus(beta=1.11, threshold=1.55)
+    arr = np.random.randn(2, 3, 4, 5)
+    np_out = np.where(
+        arr * 1.11 > 1.55, arr, 1.0 / 1.11 * np.log(1.0 + np.exp(1.11 * arr))
+    )
+    np_out = numpy_softplus(arr, 1.11, 1.55)
+    x = flow.Tensor(arr, device=flow.device(device))
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_softplus_backward(test_case, device):
+    m = flow.nn.Softplus()
+    arr = np.array([1.0, 2.0, 21.0, 20.0, 4.0])
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [0.7310585786300049, 0.8807970779778824, 1.0, 1.0, 0.9820137900379085]
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSoftplusModule(flow.unittest.TestCase):
+    def test_softplus(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_softplus,
+            _test_softplus_beta,
+            _test_softplus_threshold,
+            _test_softplus_backward,
+        ]
+        arg_dict["device"] = ["cpu"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest
+    def test_softplus_module_with_random_data(test_case):
+        m = torch.nn.Softplus(beta=random() | nothing(), threshold=random() | nothing())
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+def _test_hardswish_impl(test_case, shape, device):
+    m = flow.nn.Hardswish()
+    arr = np.random.randn(*shape)
+    f = arr + 3
+    relu6 = np.where(np.where(f < 0, 0, f) > 6, 6, np.where(f < 0, 0, f))
+    relu6_grad = np.where(f > 6, 0, np.where(f < 0, 0, 1))
+    np_out = arr * relu6 / 6
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = relu6 / 6 + arr * relu6_grad / 6
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestHardswishModule(flow.unittest.TestCase):
+    def test_hardswish(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_hardswish_impl(test_case, *arg)
+
+    @autotest()
+    def test_hardswish_module_with_random_data(test_case):
+        m = torch.nn.Hardswish()
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+def _np_hardtanh_grad(x):
+    return np.where(x <= -2.0, 0.0, np.where(x >= 2.3, 0.0, 1.0))
+
+
+def _test_hardtanh_impl(test_case, shape, device):
+    m = flow.nn.Hardtanh()
+    arr = np.random.randn(*shape)
+    np_out = np.maximum(-1, np.minimum(1, arr))
+    x = flow.Tensor(arr, device=flow.device(device))
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    m = flow.nn.Hardtanh(min_val=-2.0, max_val=2.3)
+    arr = np.random.randn(*shape)
+    np_out = np.maximum(-2.0, np.minimum(2.3, arr))
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), _np_hardtanh_grad(np_out), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestHardtanhModule(flow.unittest.TestCase):
+    def test_hardtanh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_hardtanh_impl(test_case, *arg)
+
+
+def _test_leakyrelu_impl(test_case, shape, device):
+    negative_slope = 0.2
+    m = flow.nn.LeakyReLU(negative_slope=negative_slope)
+    arr = np.random.randn(*shape)
+    np_out = np.maximum(0, arr) + negative_slope * np.minimum(0, arr)
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    np_grad = np.where(arr < 0, 1.0 * negative_slope, 1.0)
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLeakyReLUModule(flow.unittest.TestCase):
+    def test_leaky_relu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_leakyrelu_impl(test_case, *arg)
+
+    @autotest
+    def test_leakyrelu_module_with_random_data(test_case):
+        m = torch.nn.LeakyReLU(negative_slope=random() | nothing())
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+def _test_mish(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device))
+    m = flow.nn.Mish()
+    of_out = m(of_input)
+    np_out = np_input * np.tanh(numpy_softplus(np_input, 1.0, 20))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_mish_backward(test_case, shape, device):
+    m = flow.nn.Mish()
+    arr = np.random.randn(*shape)
+    x = flow.Tensor(arr, device=flow.device(device), requires_grad=True)
+    of_out = m(x)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = numpy_mish_grad(arr)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMishModule(flow.unittest.TestCase):
+    def test_mish(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_mish, _test_mish_backward]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_adaptive_pool.py b/python/oneflow/test/modules/test_adaptive_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..decdfab86cfa54f1f9903e108fbba19398d192c1
--- /dev/null
+++ b/python/oneflow/test/modules/test_adaptive_pool.py
@@ -0,0 +1,898 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_adaptive_avgpool1d_forward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        0.05580734834074974,
+                        -0.6875145435333252,
+                        -1.654430866241455,
+                        -0.6225992441177368,
+                        0.10183599591255188,
+                        0.05019790679216385,
+                        -1.2537643909454346,
+                        0.14907236397266388,
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    m = flow.nn.AdaptiveAvgPool1d(4)
+    m.to(device)
+    of_out_1 = m(input)
+    of_out_2 = flow.adaptive_avg_pool1d(input, 4)
+    np_out = np.array(
+        [
+            [
+                [
+                    -0.3158535957336426,
+                    -1.1385149955749512,
+                    0.07601694762706757,
+                    -0.5523459911346436,
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool1d_backward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        0.05580734834074974,
+                        -0.6875145435333252,
+                        -1.654430866241455,
+                        -0.6225992441177368,
+                        0.10183599591255188,
+                        0.05019790679216385,
+                        -1.2537643909454346,
+                        0.14907236397266388,
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    m = flow.nn.AdaptiveAvgPool1d(4)
+    of_out = m(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array([[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]]])
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+def _test_adaptive_avgpool2d_forward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            0.10039155930280685,
+                            0.04879157617688179,
+                            -1.0515470504760742,
+                            0.9466001987457275,
+                        ],
+                        [
+                            0.45375481247901917,
+                            0.23611211776733398,
+                            1.343685269355774,
+                            0.3979687988758087,
+                        ],
+                        [
+                            0.05580734834074974,
+                            -0.6875145435333252,
+                            -1.654430866241455,
+                            -0.6225992441177368,
+                        ],
+                        [
+                            0.10183599591255188,
+                            0.05019790679216385,
+                            -1.2537643909454346,
+                            0.14907236397266388,
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    m = flow.nn.AdaptiveAvgPool2d((2, 2))
+    m.to(device)
+    of_out_1 = m(input)
+    of_out_2 = flow.adaptive_avg_pool2d(input, (2, 2))
+    np_out = np.array(
+        [
+            [
+                [
+                    [0.20976251363754272, 0.4091767966747284],
+                    [-0.1199183315038681, -0.8454304933547974],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool2d_backward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            0.10039155930280685,
+                            0.04879157617688179,
+                            -1.0515470504760742,
+                            0.9466001987457275,
+                        ],
+                        [
+                            0.45375481247901917,
+                            0.23611211776733398,
+                            1.343685269355774,
+                            0.3979687988758087,
+                        ],
+                        [
+                            0.05580734834074974,
+                            -0.6875145435333252,
+                            -1.654430866241455,
+                            -0.6225992441177368,
+                        ],
+                        [
+                            0.10183599591255188,
+                            0.05019790679216385,
+                            -1.2537643909454346,
+                            0.14907236397266388,
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    m = flow.nn.AdaptiveAvgPool2d((2, 2))
+    of_out = m(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array(
+        [
+            [
+                [
+                    [0.25, 0.25, 0.25, 0.25],
+                    [0.25, 0.25, 0.25, 0.25],
+                    [0.25, 0.25, 0.25, 0.25],
+                    [0.25, 0.25, 0.25, 0.25],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool2d_hw_forward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [0.28242185711860657, -0.7742040753364563, -0.5439430475234985],
+                        [-0.1706847995519638, 0.0430854931473732, 0.34247592091560364],
+                        [-1.036131501197815, -1.033642292022705, 0.3455536365509033],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    m = flow.nn.AdaptiveAvgPool2d((1, 2))
+    m.to(device)
+    of_out = m(input)
+    np_out = np.array([[[[-0.4481925666332245, -0.27011242508888245]]]])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool2d_hw_backward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [0.28242185711860657, -0.7742040753364563, -0.5439430475234985],
+                        [-0.1706847995519638, 0.0430854931473732, 0.34247592091560364],
+                        [-1.036131501197815, -1.033642292022705, 0.3455536365509033],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    m = flow.nn.AdaptiveAvgPool2d((1, 2))
+    of_out = m(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array(
+        [
+            [
+                [
+                    [0.1666666716337204, 0.3333333432674408, 0.1666666716337204],
+                    [0.1666666716337204, 0.3333333432674408, 0.1666666716337204],
+                    [0.1666666716337204, 0.3333333432674408, 0.1666666716337204],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool3d_forward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            [
+                                -1.077571799600885,
+                                -0.7804538890365837,
+                                -1.2627538752119443,
+                                0.9993507145120477,
+                            ],
+                            [
+                                2.0222532489157516,
+                                1.103451377699465,
+                                -0.4377324754879578,
+                                1.890491810587517,
+                            ],
+                            [
+                                -0.5593861899064654,
+                                -0.4949520241526519,
+                                -0.18536721363519787,
+                                -0.6098969866775772,
+                            ],
+                            [
+                                -1.6536215260171816,
+                                -1.0392583540436786,
+                                0.3686776597613967,
+                                -0.5356882834951805,
+                            ],
+                        ],
+                        [
+                            [
+                                -1.2617900664449953,
+                                -1.4390921091631532,
+                                0.20654399652431357,
+                                0.8186472101906713,
+                            ],
+                            [
+                                -0.3033378863400014,
+                                -0.8173269764076293,
+                                -0.3767515097625614,
+                                -0.11021655039337777,
+                            ],
+                            [
+                                -0.22977043608192885,
+                                1.2717196366649905,
+                                -0.4790851297878291,
+                                -1.4495369404727856,
+                            ],
+                            [
+                                -1.2802093286977783,
+                                -0.11184514806663474,
+                                1.7022167087210984,
+                                -1.7354837287725355,
+                            ],
+                        ],
+                        [
+                            [
+                                2.4706497991773606,
+                                -0.6549702631973298,
+                                -0.9318107079571676,
+                                1.4652904271682428,
+                            ],
+                            [
+                                1.1419864234341397,
+                                1.389909081086008,
+                                0.9657841900525568,
+                                -0.8563114264976619,
+                            ],
+                            [
+                                0.19515087084250754,
+                                -0.37808457398571094,
+                                0.2938625398496183,
+                                0.9279930510353327,
+                            ],
+                            [
+                                -0.9374118277994007,
+                                0.3341831730452431,
+                                -0.2792542765303833,
+                                0.38029090707066726,
+                            ],
+                        ],
+                        [
+                            [
+                                0.5918686659736041,
+                                -0.7870631089938902,
+                                -0.9534344874245392,
+                                0.31341612954718795,
+                            ],
+                            [
+                                0.7509029444145228,
+                                -0.9299288398562323,
+                                -0.7343054052782476,
+                                -0.8806481590696694,
+                            ],
+                            [
+                                -0.4707853016353985,
+                                0.12253641652645629,
+                                0.5088022039832846,
+                                0.520391789327562,
+                            ],
+                            [
+                                -0.0861300651163632,
+                                0.30291348404866386,
+                                -0.6268565873680123,
+                                -0.27469204305759976,
+                            ],
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    m = flow.nn.AdaptiveAvgPool3d((2, 2, 2))
+    m.to(device)
+    of_out_1 = m(input)
+    of_out_2 = flow.adaptive_avg_pool3d(input, (2, 2, 2))
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        [-0.3192335125472539, 0.2159474151198386],
+                        [-0.5121654212876662, -0.3655204892948264],
+                    ],
+                    [
+                        [0.4966693377547728, -0.2015024299324123],
+                        [-0.11470347800925032, 0.18131719803880864],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool3d_backward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            [
+                                -1.077571799600885,
+                                -0.7804538890365837,
+                                -1.2627538752119443,
+                                0.9993507145120477,
+                            ],
+                            [
+                                2.0222532489157516,
+                                1.103451377699465,
+                                -0.4377324754879578,
+                                1.890491810587517,
+                            ],
+                            [
+                                -0.5593861899064654,
+                                -0.4949520241526519,
+                                -0.18536721363519787,
+                                -0.6098969866775772,
+                            ],
+                            [
+                                -1.6536215260171816,
+                                -1.0392583540436786,
+                                0.3686776597613967,
+                                -0.5356882834951805,
+                            ],
+                        ],
+                        [
+                            [
+                                -1.2617900664449953,
+                                -1.4390921091631532,
+                                0.20654399652431357,
+                                0.8186472101906713,
+                            ],
+                            [
+                                -0.3033378863400014,
+                                -0.8173269764076293,
+                                -0.3767515097625614,
+                                -0.11021655039337777,
+                            ],
+                            [
+                                -0.22977043608192885,
+                                1.2717196366649905,
+                                -0.4790851297878291,
+                                -1.4495369404727856,
+                            ],
+                            [
+                                -1.2802093286977783,
+                                -0.11184514806663474,
+                                1.7022167087210984,
+                                -1.7354837287725355,
+                            ],
+                        ],
+                        [
+                            [
+                                2.4706497991773606,
+                                -0.6549702631973298,
+                                -0.9318107079571676,
+                                1.4652904271682428,
+                            ],
+                            [
+                                1.1419864234341397,
+                                1.389909081086008,
+                                0.9657841900525568,
+                                -0.8563114264976619,
+                            ],
+                            [
+                                0.19515087084250754,
+                                -0.37808457398571094,
+                                0.2938625398496183,
+                                0.9279930510353327,
+                            ],
+                            [
+                                -0.9374118277994007,
+                                0.3341831730452431,
+                                -0.2792542765303833,
+                                0.38029090707066726,
+                            ],
+                        ],
+                        [
+                            [
+                                0.5918686659736041,
+                                -0.7870631089938902,
+                                -0.9534344874245392,
+                                0.31341612954718795,
+                            ],
+                            [
+                                0.7509029444145228,
+                                -0.9299288398562323,
+                                -0.7343054052782476,
+                                -0.8806481590696694,
+                            ],
+                            [
+                                -0.4707853016353985,
+                                0.12253641652645629,
+                                0.5088022039832846,
+                                0.520391789327562,
+                            ],
+                            [
+                                -0.0861300651163632,
+                                0.30291348404866386,
+                                -0.6268565873680123,
+                                -0.27469204305759976,
+                            ],
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    m = flow.nn.AdaptiveAvgPool3d((2, 2, 2))
+    of_out = m(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array(
+        [
+            [
+                [
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool3d_dhw_forward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            [
+                                -1.077571799600885,
+                                -0.7804538890365837,
+                                -1.2627538752119443,
+                                0.9993507145120477,
+                            ],
+                            [
+                                2.0222532489157516,
+                                1.103451377699465,
+                                -0.4377324754879578,
+                                1.890491810587517,
+                            ],
+                            [
+                                -0.5593861899064654,
+                                -0.4949520241526519,
+                                -0.18536721363519787,
+                                -0.6098969866775772,
+                            ],
+                            [
+                                -1.6536215260171816,
+                                -1.0392583540436786,
+                                0.3686776597613967,
+                                -0.5356882834951805,
+                            ],
+                        ],
+                        [
+                            [
+                                -1.2617900664449953,
+                                -1.4390921091631532,
+                                0.20654399652431357,
+                                0.8186472101906713,
+                            ],
+                            [
+                                -0.3033378863400014,
+                                -0.8173269764076293,
+                                -0.3767515097625614,
+                                -0.11021655039337777,
+                            ],
+                            [
+                                -0.22977043608192885,
+                                1.2717196366649905,
+                                -0.4790851297878291,
+                                -1.4495369404727856,
+                            ],
+                            [
+                                -1.2802093286977783,
+                                -0.11184514806663474,
+                                1.7022167087210984,
+                                -1.7354837287725355,
+                            ],
+                        ],
+                        [
+                            [
+                                2.4706497991773606,
+                                -0.6549702631973298,
+                                -0.9318107079571676,
+                                1.4652904271682428,
+                            ],
+                            [
+                                1.1419864234341397,
+                                1.389909081086008,
+                                0.9657841900525568,
+                                -0.8563114264976619,
+                            ],
+                            [
+                                0.19515087084250754,
+                                -0.37808457398571094,
+                                0.2938625398496183,
+                                0.9279930510353327,
+                            ],
+                            [
+                                -0.9374118277994007,
+                                0.3341831730452431,
+                                -0.2792542765303833,
+                                0.38029090707066726,
+                            ],
+                        ],
+                        [
+                            [
+                                0.5918686659736041,
+                                -0.7870631089938902,
+                                -0.9534344874245392,
+                                0.31341612954718795,
+                            ],
+                            [
+                                0.7509029444145228,
+                                -0.9299288398562323,
+                                -0.7343054052782476,
+                                -0.8806481590696694,
+                            ],
+                            [
+                                -0.4707853016353985,
+                                0.12253641652645629,
+                                0.5088022039832846,
+                                0.520391789327562,
+                            ],
+                            [
+                                -0.0861300651163632,
+                                0.30291348404866386,
+                                -0.6268565873680123,
+                                -0.27469204305759976,
+                            ],
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    m = flow.nn.AdaptiveAvgPool3d((1, 2, 3))
+    m.to(device)
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [0.08871791260375947, -0.4024959376509308, 0.00722249259371315],
+                    [-0.31343444964845824, 0.08188803218941582, -0.09210164562800888],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_adaptive_avgpool3d_dhw_backward(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            [
+                                -1.077571799600885,
+                                -0.7804538890365837,
+                                -1.2627538752119443,
+                                0.9993507145120477,
+                            ],
+                            [
+                                2.0222532489157516,
+                                1.103451377699465,
+                                -0.4377324754879578,
+                                1.890491810587517,
+                            ],
+                            [
+                                -0.5593861899064654,
+                                -0.4949520241526519,
+                                -0.18536721363519787,
+                                -0.6098969866775772,
+                            ],
+                            [
+                                -1.6536215260171816,
+                                -1.0392583540436786,
+                                0.3686776597613967,
+                                -0.5356882834951805,
+                            ],
+                        ],
+                        [
+                            [
+                                -1.2617900664449953,
+                                -1.4390921091631532,
+                                0.20654399652431357,
+                                0.8186472101906713,
+                            ],
+                            [
+                                -0.3033378863400014,
+                                -0.8173269764076293,
+                                -0.3767515097625614,
+                                -0.11021655039337777,
+                            ],
+                            [
+                                -0.22977043608192885,
+                                1.2717196366649905,
+                                -0.4790851297878291,
+                                -1.4495369404727856,
+                            ],
+                            [
+                                -1.2802093286977783,
+                                -0.11184514806663474,
+                                1.7022167087210984,
+                                -1.7354837287725355,
+                            ],
+                        ],
+                        [
+                            [
+                                2.4706497991773606,
+                                -0.6549702631973298,
+                                -0.9318107079571676,
+                                1.4652904271682428,
+                            ],
+                            [
+                                1.1419864234341397,
+                                1.389909081086008,
+                                0.9657841900525568,
+                                -0.8563114264976619,
+                            ],
+                            [
+                                0.19515087084250754,
+                                -0.37808457398571094,
+                                0.2938625398496183,
+                                0.9279930510353327,
+                            ],
+                            [
+                                -0.9374118277994007,
+                                0.3341831730452431,
+                                -0.2792542765303833,
+                                0.38029090707066726,
+                            ],
+                        ],
+                        [
+                            [
+                                0.5918686659736041,
+                                -0.7870631089938902,
+                                -0.9534344874245392,
+                                0.31341612954718795,
+                            ],
+                            [
+                                0.7509029444145228,
+                                -0.9299288398562323,
+                                -0.7343054052782476,
+                                -0.8806481590696694,
+                            ],
+                            [
+                                -0.4707853016353985,
+                                0.12253641652645629,
+                                0.5088022039832846,
+                                0.520391789327562,
+                            ],
+                            [
+                                -0.0861300651163632,
+                                0.30291348404866386,
+                                -0.6268565873680123,
+                                -0.27469204305759976,
+                            ],
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    m = flow.nn.AdaptiveAvgPool3d((1, 2, 3))
+    of_out = m(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array(
+        [
+            [
+                [
+                    [
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                    ],
+                    [
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                    ],
+                    [
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                    ],
+                    [
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                        [0.0625, 0.125, 0.125, 0.0625],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAdaptiveAvgPool(flow.unittest.TestCase):
+    def test_adaptive_avgpool1d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_adaptive_avgpool1d_forward,
+            _test_adaptive_avgpool1d_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_adaptive_avgpool2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_adaptive_avgpool2d_forward,
+            _test_adaptive_avgpool2d_backward,
+            _test_adaptive_avgpool2d_hw_forward,
+            _test_adaptive_avgpool2d_hw_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_adaptive_avgpool3d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_adaptive_avgpool3d_forward,
+            _test_adaptive_avgpool3d_backward,
+            _test_adaptive_avgpool3d_dhw_forward,
+            _test_adaptive_avgpool3d_dhw_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_add.py b/python/oneflow/test/modules/test_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbc7a2a51f62b1b9e80be750bdddaab95b56a85c
--- /dev/null
+++ b/python/oneflow/test/modules/test_add.py
@@ -0,0 +1,156 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_add_forward(test_case, shape, device):
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.add(x, y)
+    np_out = np.add(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = 5
+    y = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.add(x, y)
+    np_out = np.add(x, y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = 5
+    of_out = flow.add(x, y)
+    np_out = np.add(x.numpy(), y)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = flow.Tensor(np.array([5.0]), device=flow.device(device))
+    of_out = flow.add(x, y)
+    np_out = np.add(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = flow.Tensor(np.random.randn(1, 1), device=flow.device(device))
+    y = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.add(x, y)
+    np_out = np.add(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_add_backward(test_case, shape, device):
+    x = 5
+    y = flow.Tensor(
+        np.random.randn(*shape), requires_grad=True, device=flow.device(device)
+    )
+    of_out = flow.add(x, y).sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(y.grad.numpy(), np.ones(shape=shape), 0.0001, 0.0001)
+    )
+
+
+def _test_inplace_add(test_case, shape, device):
+    np_x = np.random.randn(*shape)
+    of_x = flow.Tensor(
+        np_x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_x_inplace = of_x + 1
+    id_old = id(of_x_inplace)
+    of_x_inplace.add_(5)
+    test_case.assertEqual(id_old, id(of_x_inplace))
+    np_out = np_x + 1 + 5
+    test_case.assertTrue(np.allclose(of_x_inplace.numpy(), np_out, 1e-05, 1e-05))
+    of_x_inplace = of_x_inplace.sum()
+    of_x_inplace.backward()
+    test_case.assertTrue(np.allclose(of_x.grad.numpy(), np.ones(shape), 1e-05, 1e-05))
+    of_x = flow.Tensor(
+        np_x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_y = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=False
+    )
+    of_x_inplace = of_x + 1
+    id_old = id(of_x_inplace)
+    of_x_inplace.add_(of_y)
+    test_case.assertEqual(id_old, id(of_x_inplace))
+    np_out = np_x + 1 + of_y.numpy()
+    test_case.assertTrue(np.allclose(of_x_inplace.numpy(), np_out, 1e-05, 1e-05))
+    of_x_inplace = of_x_inplace.sum()
+    of_x_inplace.backward()
+    test_case.assertTrue(np.allclose(of_x.grad.numpy(), np.ones(shape), 1e-05, 1e-05))
+    of_x = flow.Tensor(
+        np_x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_y = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=False
+    )
+    of_x_inplace = of_x + 1
+    id_old = id(of_x_inplace)
+    of_x_inplace += of_y
+    test_case.assertEqual(id_old, id(of_x_inplace))
+    np_out = np_x + 1 + of_y.numpy()
+    test_case.assertTrue(np.allclose(of_x_inplace.numpy(), np_out, 1e-05, 1e-05))
+    of_x_inplace = of_x_inplace.sum()
+    of_x_inplace.backward()
+    test_case.assertTrue(np.allclose(of_x.grad.numpy(), np.ones(shape), 1e-05, 1e-05))
+    of_x = flow.Tensor(
+        np_x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_y = flow.Tensor(np.array([5.0]), device=flow.device(device), requires_grad=False)
+    of_x_inplace = of_x + 1
+    id_old = id(of_x_inplace)
+    of_x_inplace.add_(of_y)
+    test_case.assertEqual(id_old, id(of_x_inplace))
+    np_out = np_x + 6
+    test_case.assertTrue(np.allclose(of_x_inplace.numpy(), np_out, 1e-05, 1e-05))
+    of_x_inplace = of_x_inplace.sum()
+    of_x_inplace.backward()
+    test_case.assertTrue(np.allclose(of_x.grad.numpy(), np.ones(shape), 1e-05, 1e-05))
+    of_x = flow.Tensor(
+        np_x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    np_y = np.random.randn(*shape[:-1], 1)
+    of_y = flow.Tensor(np_y, device=flow.device(device), requires_grad=False)
+    of_x_inplace = of_x + 1
+    id_old = id(of_x_inplace)
+    of_x_inplace.add_(of_y)
+    test_case.assertEqual(id_old, id(of_x_inplace))
+    np_out = np_x + 1 + np_y
+    test_case.assertTrue(np.allclose(of_x_inplace.numpy(), np_out, 1e-05, 1e-05))
+    of_x_inplace = of_x_inplace.sum()
+    of_x_inplace.backward()
+    test_case.assertTrue(np.allclose(of_x.grad.numpy(), np.ones(shape), 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAddModule(flow.unittest.TestCase):
+    def test_add(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_add_forward,
+            _test_add_backward,
+            _test_inplace_add,
+        ]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_addmm.py b/python/oneflow/test/modules/test_addmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dd938c29567139d19312047a1bac5afe17e0633
--- /dev/null
+++ b/python/oneflow/test/modules/test_addmm.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_addmm(test_case, shape, alpha, beta, device):
+    mat1 = np.random.randn(*shape)
+    mat2 = np.random.randn(*shape)
+    input = np.random.randn(*shape)
+    mat1_tensor = flow.Tensor(mat1, dtype=flow.float32, device=flow.device(device))
+    mat2_tensor = flow.Tensor(mat2, dtype=flow.float32, device=flow.device(device))
+    input_tensor = flow.Tensor(input, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.addmm(input_tensor, mat1_tensor, mat2_tensor, alpha, beta)
+    np_out = np.add(beta * input, alpha * np.matmul(mat1, mat2))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_addmm_backward(test_case, shape, alpha, beta, device):
+    mat1 = np.random.randn(*shape)
+    mat2 = np.random.randn(*shape)
+    input = np.random.randn(*shape)
+    mat1_tensor = flow.Tensor(mat1, dtype=flow.float32, device=flow.device(device))
+    mat2_tensor = flow.Tensor(mat2, dtype=flow.float32, device=flow.device(device))
+    input_tensor = flow.Tensor(
+        input, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    of_out = flow.addmm(input_tensor, mat1_tensor, mat2_tensor, alpha, beta).sum()
+    of_out.backward()
+    np_grad_out = np.ones_like(input) * beta
+    test_case.assertTrue(
+        np.allclose(input_tensor.grad.numpy(), np_grad_out, 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAddmm(flow.unittest.TestCase):
+    def test_addmm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["function_test"] = [_test_addmm, _test_addmm_backward]
+        arg_dict["shape"] = [(3, 3)]
+        arg_dict["alpha"] = [4, 1.2, -3.7]
+        arg_dict["beta"] = [1.5, 4, -2]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_allreduce.py b/python/oneflow/test/modules/test_allreduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..e440384d8bf5a5a0c74300812c3cec356d3c7847
--- /dev/null
+++ b/python/oneflow/test/modules/test_allreduce.py
@@ -0,0 +1,49 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n2d()
+class TestAllReduce(flow.unittest.TestCase):
+    def test_all_reduce(test_case):
+        arr_rank1 = np.array([1, 2])
+        arr_rank2 = np.array([3, 4])
+        if flow.distributed.get_rank() == 0:
+            x = flow.Tensor([1, 2])
+        elif flow.distributed.get_rank() == 1:
+            x = flow.Tensor([3, 4])
+        else:
+            raise ValueError
+        x = x.to(f"cuda:{flow.distributed.get_local_rank()}")
+        nccl_allreduce_op = (
+            flow.builtin_op("eager_nccl_all_reduce")
+            .Input("in")
+            .Output("out")
+            .Attr("parallel_conf", f'device_tag: "gpu", device_name: "0:0-1"')
+            .Build()
+        )
+        y = nccl_allreduce_op(x)[0]
+        test_case.assertTrue(np.allclose(y.numpy(), arr_rank1 + arr_rank2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_arange.py b/python/oneflow/test/modules/test_arange.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c4b92ffaafd5e1f2fa82f62b5c09454cb2c087f
--- /dev/null
+++ b/python/oneflow/test/modules/test_arange.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_arange(test_case, device):
+    np_out = np.arange(13, dtype=np.float32)
+    of_out = flow.arange(13, device=device, dtype=flow.float32)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_arange_step_prarm(test_case, device):
+    np_out = np.arange(0, 20, 2)
+    of_out = flow.arange(0, 20, step=2, device=device)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_arange_more_params(test_case, device):
+    np_out = np.arange(0, 100, 3)
+    of_out = flow.arange(start=0, end=100, step=3, device=device)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_arange_backward(test_case, device):
+    np_out = np.arange(13)
+    x = flow.arange(13, device=device)
+    x.requires_grad = True
+    y = x.sum()
+    y.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np.ones(13), 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestArange(flow.unittest.TestCase):
+    def test_transpose(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["function_test"] = [
+            _test_arange,
+            _test_arange_step_prarm,
+            _test_arange_more_params,
+            _test_arange_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_argmax.py b/python/oneflow/test/modules/test_argmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1299eaa8cf764d8529236045a99d03431a43d30
--- /dev/null
+++ b/python/oneflow/test/modules/test_argmax.py
@@ -0,0 +1,96 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_argmax_aixs_negative(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    axis = -1
+    of_out = flow.argmax(input, dim=axis)
+    np_out = np.argmax(input.numpy(), axis=axis)
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_tensor_argmax(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    axis = 0
+    of_out = input.argmax(dim=axis)
+    np_out = np.argmax(input.numpy(), axis=axis)
+    test_case.assertTrue(np.array_equal(of_out.numpy().shape, np_out.shape))
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_argmax_axis_postive(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    axis = 1
+    of_out = flow.argmax(input, dim=axis)
+    np_out = np.argmax(input.numpy(), axis=axis)
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_argmax_keepdims(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    axis = 0
+    of_out = input.argmax(axis, True)
+    np_out = np.argmax(input.numpy(), axis=axis)
+    np_out = np.expand_dims(np_out, axis=axis)
+    test_case.assertTrue(np.array_equal(of_out.numpy().shape, np_out.shape))
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_argmax_dim_equal_none(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = input.argmax()
+    np_out = np.argmax(input.numpy().flatten(), axis=0)
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestArgmax(flow.unittest.TestCase):
+    def test_argmax(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_argmax_aixs_negative,
+            _test_tensor_argmax,
+            _test_argmax_axis_postive,
+            _test_argmax_keepdims,
+            _test_argmax_dim_equal_none,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_argsort.py b/python/oneflow/test/modules/test_argsort.py
new file mode 100644
index 0000000000000000000000000000000000000000..981ccb922d0b9e710a140b5259908c1e55c67ac4
--- /dev/null
+++ b/python/oneflow/test/modules/test_argsort.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_argsort(test_case, data_shape, axis, descending, data_type, device):
+    input = flow.Tensor(
+        np.random.randn(*data_shape),
+        dtype=type_name_to_flow_type[data_type],
+        device=flow.device(device),
+    )
+    of_out = flow.argsort(input, dim=axis, descending=descending)
+    np_input = -input.numpy() if descending else input.numpy()
+    np_out = np.argsort(np_input, axis=axis)
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_tensor_argsort(test_case, data_shape, axis, descending, data_type, device):
+    input = flow.Tensor(
+        np.random.randn(*data_shape),
+        dtype=type_name_to_flow_type[data_type],
+        device=flow.device(device),
+    )
+    of_out = input.argsort(dim=axis, descending=descending)
+    np_input = -input.numpy() if descending else input.numpy()
+    np_out = np.argsort(np_input, axis=axis)
+    test_case.assertTrue(np.array_equal(of_out.numpy().shape, np_out.shape))
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestArgsort(flow.unittest.TestCase):
+    def test_argsort(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_argsort, _test_tensor_argsort]
+        arg_dict["data_shape"] = [(2, 6, 5, 4), (3, 4, 8)]
+        arg_dict["axis"] = [-1, 0, 2]
+        arg_dict["descending"] = [True, False]
+        arg_dict["data_type"] = ["double", "float32", "int32"]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_argwhere.py b/python/oneflow/test/modules/test_argwhere.py
new file mode 100644
index 0000000000000000000000000000000000000000..375c1f52660d90c530ba43a4d15b4e4b16043d76
--- /dev/null
+++ b/python/oneflow/test/modules/test_argwhere.py
@@ -0,0 +1,48 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_argwhere(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(np_input, device=flow.device(device))
+    of_out = flow.argwhere(input)
+    np_out = np.argwhere(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    test_case.assertTrue(np.array_equal(of_out.numpy().shape, np_out.shape))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestArgwhere(flow.unittest.TestCase):
+    def test_argwhere(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_argwhere]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_atan.py b/python/oneflow/test/modules/test_atan.py
new file mode 100644
index 0000000000000000000000000000000000000000..91c602b7e847a65d489251f90d748ba1dafaafd2
--- /dev/null
+++ b/python/oneflow/test/modules/test_atan.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_atan(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.atan(of_input)
+    np_out = np.arctan(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1 / (1 + np_input ** 2)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_arctan(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.arctan(of_input)
+    np_out = np.arctan(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1 / (1 + np_input ** 2)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAtan(flow.unittest.TestCase):
+    def test_atan(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_atan, _test_arctan]
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_atan2.py b/python/oneflow/test/modules/test_atan2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8cba30fd05f615e96930fe79ab79c930d18f078
--- /dev/null
+++ b/python/oneflow/test/modules/test_atan2.py
@@ -0,0 +1,137 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_atan2_forward(test_case, shape, scalar, device):
+    np_input_x = 10 * np.random.rand(*shape)
+    np_input_y = 10 * np.random.randn(*shape)
+    of_input_x = flow.Tensor(np_input_x, dtype=flow.float32, device=flow.device(device))
+    of_input_y = flow.Tensor(np_input_y, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.atan2(of_input_x, of_input_y)
+    np_out = np.arctan2(np_input_x, np_input_y)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_atan2_backward(test_case, device):
+    np_input_x = np.random.rand(2, 3)
+    np_input_y = np.random.rand(2, 3)
+    np_y_grad = -1 * np_input_x / (np_input_x * np_input_x + np_input_y * np_input_y)
+    np_x_grad = np_input_y / (np_input_x * np_input_x + np_input_y * np_input_y)
+
+    def test_x_y_grad():
+        of_input_x = flow.Tensor(
+            np_input_x,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_input_y = flow.Tensor(
+            np_input_y,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.atan2(of_input_x, of_input_y)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_x.grad.numpy(), np_x_grad, 0.0001, 0.0001)
+        )
+        test_case.assertTrue(
+            np.allclose(of_input_y.grad.numpy(), np_y_grad, 0.0001, 0.0001)
+        )
+
+    def test_x_grad():
+        of_input_x = flow.Tensor(
+            np_input_x,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_input_y = flow.Tensor(
+            np_input_y, dtype=flow.float32, device=flow.device(device)
+        )
+        of_out = flow.atan2(of_input_x, of_input_y)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_x.grad.numpy(), np_x_grad, 0.0001, 0.0001)
+        )
+
+    def test_y_grad():
+        of_input_x = flow.Tensor(
+            np_input_x, dtype=flow.float32, device=flow.device(device)
+        )
+        of_input_y = flow.Tensor(
+            np_input_y,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.atan2(of_input_x, of_input_y)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_y.grad.numpy(), np_y_grad, 0.0001, 0.0001)
+        )
+
+    test_x_y_grad()
+    test_x_grad()
+    test_y_grad()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAtan2(flow.unittest.TestCase):
+    def test_atan2_forward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["scalar"] = [2.1, 0.8]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_atan2_forward(test_case, *arg)
+
+    def test_atan2_backward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_atan2_backward(test_case, *arg)
+
+    def test_flow_atan2_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            test_flow_against_pytorch(
+                test_case,
+                "atan2",
+                extra_annotations={"other": flow.Tensor},
+                extra_generators={
+                    "input": random_tensor(ndim=1, dim1=1),
+                    "other": random_tensor(ndim=1, dim1=1),
+                },
+                device=device,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_atanh.py b/python/oneflow/test/modules/test_atanh.py
new file mode 100644
index 0000000000000000000000000000000000000000..966e8b029d8faa17293dc2d240cdfe5a661ef216
--- /dev/null
+++ b/python/oneflow/test/modules/test_atanh.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_atanh_impl(test_case, shape, device):
+    np_input = np.random.random(shape) - 0.5
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.atanh(of_input)
+    np_out = np.arctanh(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1.0 / (1.0 - np.square(np_input))
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+def _test_arctanh_impl(test_case, shape, device):
+    np_input = np.random.random(shape) - 0.5
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.arctanh(of_input)
+    np_out = np.arctanh(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1.0 / (1.0 - np.square(np_input))
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAtanh(flow.unittest.TestCase):
+    def test_atanh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_atanh_impl(test_case, *arg)
+            _test_arctanh_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_autograd.py b/python/oneflow/test/modules/test_autograd.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4abc49b9c868175b80914a0133672a8dcf5ebd1
--- /dev/null
+++ b/python/oneflow/test/modules/test_autograd.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_autograd_backward(test_case, shape, device):
+    np_input = np.random.rand(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input ** 2
+    of_out_sum = of_out.sum()
+    of_out_sum.backward()
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_input * 2, 0.0001, 0.0001)
+    )
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input ** 2
+    of_out_sum = of_out.sum()
+    of_out_sum.backward(flow.ones_like(of_out_sum) * 3)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_input * 6, 0.0001, 0.0001)
+    )
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input ** 2
+    of_out_sum = of_out.sum()
+    of_out_sum.backward(retain_graph=True)
+    of_out_sum.backward(retain_graph=True)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_input * 4, 0.0001, 0.0001)
+    )
+
+
+def _test_autograd_grad(test_case, shape, device):
+    np_input = np.random.rand(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input ** 2
+    of_out_sum = of_out.sum()
+    grad = flow.autograd.grad(of_out_sum, of_input)[0]
+    test_case.assertTrue(of_input.grad is None)
+    test_case.assertTrue(np.allclose(grad.numpy(), np_input * 2, 0.0001, 0.0001))
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input ** 2
+    of_out_sum = of_out.sum()
+    grad = flow.autograd.grad(of_out_sum, of_input, flow.ones_like(of_out_sum) * 3)[0]
+    test_case.assertTrue(np.allclose(grad.numpy(), np_input * 6, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAutograd(flow.unittest.TestCase):
+    def test_autograd_interface(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["case"] = [_test_autograd_backward, _test_autograd_grad]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_avgpool.py b/python/oneflow/test/modules/test_avgpool.py
new file mode 100644
index 0000000000000000000000000000000000000000..e386a56e2cefd3824ad7f78a45aa2eae4f55b76c
--- /dev/null
+++ b/python/oneflow/test/modules/test_avgpool.py
@@ -0,0 +1,602 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _nd_tuple_to_dhw(nd_tuple, dim, prefix=1, dhw_offset=0):
+    assert dim <= 3
+    assert dim == len(nd_tuple) - dhw_offset
+    nd_tuple = list(nd_tuple)
+    dhw_tuple = nd_tuple[:dhw_offset]
+    dhw_tuple.extend([prefix for _ in range(3 - dim)])
+    dhw_tuple.extend(nd_tuple[dhw_offset:])
+    return tuple(dhw_tuple)
+
+
+def _dhw_tuple_to_nd(dhw_tuple, dim, prefix=1, dhw_offset=0):
+    assert dim <= 3
+    assert 3 == len(dhw_tuple) - dhw_offset
+    dhw_tuple = list(dhw_tuple)
+    nd_tuple = dhw_tuple[:dhw_offset]
+    nd_offset = dhw_offset + 3 - dim
+    for i in dhw_tuple[dhw_offset:nd_offset]:
+        assert prefix == i
+    nd_tuple.extend(dhw_tuple[nd_offset:])
+    return tuple(nd_tuple)
+
+
+class AvgPoolNumpy:
+    def __init__(self, dim=2, kernel_size=(2, 2), stride=(2, 2), padding=(0, 0)):
+        self.dim = dim
+        self.stride = _nd_tuple_to_dhw(stride, dim)
+        self.padding = _nd_tuple_to_dhw(padding, dim, prefix=0)
+        self.kernel_size = _nd_tuple_to_dhw(kernel_size, dim)
+        self.w_depth = self.kernel_size[0]
+        self.w_height = self.kernel_size[1]
+        self.w_width = self.kernel_size[2]
+        self.min_val = 0.0
+
+    def __call__(self, x):
+        self.x_shape = x.shape
+        x_shape_5d = _nd_tuple_to_dhw(self.x_shape, self.dim, prefix=1, dhw_offset=2)
+        x = x.reshape(x_shape_5d)
+        self.in_batch = np.shape(x)[0]
+        self.in_channel = np.shape(x)[1]
+        self.in_depth = np.shape(x)[2]
+        self.in_height = np.shape(x)[3]
+        self.in_width = np.shape(x)[4]
+        pad_x = np.pad(
+            x,
+            (
+                (0, 0),
+                (0, 0),
+                (self.padding[0], self.padding[0]),
+                (self.padding[1], self.padding[1]),
+                (self.padding[2], self.padding[2]),
+            ),
+            "constant",
+            constant_values=(self.min_val, self.min_val),
+        )
+        self.pad_x = pad_x
+        self.pad_shape = pad_x.shape
+        self.out_depth = int((self.in_depth - self.w_depth) / self.stride[0]) + 1
+        self.out_height = int((self.in_height - self.w_height) / self.stride[1]) + 1
+        self.out_width = int((self.in_width - self.w_width) / self.stride[2]) + 1
+        self.pad_out_depth = np.uint16(
+            math.ceil((self.pad_shape[2] - self.w_depth + 1) / self.stride[0])
+        )
+        self.pad_out_height = np.uint16(
+            math.ceil((self.pad_shape[3] - self.w_height + 1) / self.stride[1])
+        )
+        self.pad_out_width = np.uint16(
+            math.ceil((self.pad_shape[4] - self.w_width + 1) / self.stride[2])
+        )
+        out = np.zeros(
+            (
+                self.in_batch,
+                self.in_channel,
+                self.pad_out_depth,
+                self.pad_out_height,
+                self.pad_out_width,
+            )
+        )
+        self.arg_avg = np.zeros_like(out)
+        for n in range(self.in_batch):
+            for c in range(self.in_channel):
+                for i in range(self.pad_out_depth):
+                    for j in range(self.pad_out_height):
+                        for k in range(self.pad_out_width):
+                            start_i = i * self.stride[0]
+                            start_j = j * self.stride[1]
+                            start_k = k * self.stride[2]
+                            end_i = start_i + self.w_depth
+                            end_j = start_j + self.w_height
+                            end_k = start_k + self.w_width
+                            out[n, c, i, j, k] = np.average(
+                                pad_x[n, c, start_i:end_i, start_j:end_j, start_k:end_k]
+                            )
+                            self.arg_avg[n, c, i, j, k] = np.average(
+                                pad_x[n, c, start_i:end_i, start_j:end_j, start_k:end_k]
+                            )
+        self.out_shape_5d = out.shape
+        out_shape = _dhw_tuple_to_nd(out.shape, self.dim, dhw_offset=2)
+        out = out.reshape(out_shape)
+        return out
+
+
+def _test_avgpool3d(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [[-1.1132425, -0.79719835], [1.99409501, 0.23270504]],
+                    [[-0.69827855, -0.19336448], [0.86132664, -0.86734113]],
+                ],
+                [
+                    [[0.90614991, -1.11548232], [-0.17957948, -0.14095705]],
+                    [[0.12856562, -0.82078871], [-0.79095713, -0.86583306]],
+                ],
+            ],
+            [
+                [
+                    [[-1.99924145, 0.39951706], [-1.31197624, -0.68801404]],
+                    [[-0.09358264, 0.12486073], [-0.45929356, 0.31948792]],
+                ],
+                [
+                    [[0.72989192, 1.65362442], [0.12919752, -1.45644394]],
+                    [[-0.33608345, -0.4950027], [-0.30841882, 1.06204887]],
+                ],
+            ],
+        ]
+    )
+    dim = 3
+    (kernel_size, stride, padding) = ((2, 2, 2), (1, 1, 1), (0, 0, 0))
+    m_numpy = AvgPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.AvgPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+
+
+def _test_avgpool3d_backward(test_case, device):
+    dim = 3
+    input_arr = np.array(
+        [
+            [
+                [
+                    [[-1.1132425, -0.79719835], [1.99409501, 0.23270504]],
+                    [[-0.69827855, -0.19336448], [0.86132664, -0.86734113]],
+                ],
+                [
+                    [[0.90614991, -1.11548232], [-0.17957948, -0.14095705]],
+                    [[0.12856562, -0.82078871], [-0.79095713, -0.86583306]],
+                ],
+            ],
+            [
+                [
+                    [[-1.99924145, 0.39951706], [-1.31197624, -0.68801404]],
+                    [[-0.09358264, 0.12486073], [-0.45929356, 0.31948792]],
+                ],
+                [
+                    [[0.72989192, 1.65362442], [0.12919752, -1.45644394]],
+                    [[-0.33608345, -0.4950027], [-0.30841882, 1.06204887]],
+                ],
+            ],
+        ]
+    )
+    (kernel_size, stride, padding) = ((2, 2, 2), (1, 1, 1), (0, 0, 0))
+    m_numpy = AvgPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.AvgPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = np.zeros(shape=input_arr.shape)
+    numpy_grad[...] = 0.125
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 1e-05, 1e-05))
+
+
+def _test_avgpool3d_special_kernel_size_backward(test_case, device):
+    dim = 3
+    input_arr = np.array(
+        [
+            [
+                [
+                    [
+                        [
+                            1.66918755,
+                            -0.91884044,
+                            -0.53434356,
+                            -0.57682845,
+                            -0.57808441,
+                            1.99174729,
+                        ],
+                        [
+                            -0.57801338,
+                            1.810334,
+                            -0.30454292,
+                            -0.32011417,
+                            -2.4486984,
+                            -0.66338876,
+                        ],
+                        [
+                            -0.15772485,
+                            0.6784365,
+                            1.18897709,
+                            1.20692234,
+                            1.43578745,
+                            -0.36833255,
+                        ],
+                        [
+                            0.74718159,
+                            0.09179258,
+                            -0.94193085,
+                            -0.35707129,
+                            -0.62257021,
+                            0.42824892,
+                        ],
+                        [
+                            -0.13482852,
+                            -0.02991985,
+                            0.28971932,
+                            1.80695194,
+                            -0.07023364,
+                            -0.92182529,
+                        ],
+                        [
+                            -0.02296651,
+                            -1.43817104,
+                            1.4028344,
+                            0.18194114,
+                            -0.59439764,
+                            1.51888284,
+                        ],
+                    ],
+                    [
+                        [
+                            0.39941812,
+                            -0.69972636,
+                            1.05458831,
+                            0.93664904,
+                            -1.00730994,
+                            1.09524098,
+                        ],
+                        [
+                            0.63022077,
+                            0.85397415,
+                            1.0084123,
+                            -0.20605707,
+                            -0.37284122,
+                            0.11387859,
+                        ],
+                        [
+                            -1.26611431,
+                            -0.62012754,
+                            0.09563748,
+                            -0.21232549,
+                            -1.77755391,
+                            0.22544966,
+                        ],
+                        [
+                            0.05055287,
+                            -0.97104387,
+                            0.00743758,
+                            -0.01799878,
+                            -0.01687093,
+                            -0.95385641,
+                        ],
+                        [
+                            -0.46048377,
+                            0.74474033,
+                            0.38518884,
+                            1.4415209,
+                            -0.74031676,
+                            1.3467917,
+                        ],
+                        [
+                            1.07532674,
+                            -1.22199077,
+                            0.53129623,
+                            -1.15805626,
+                            -1.59087007,
+                            0.27252823,
+                        ],
+                    ],
+                    [
+                        [
+                            2.10041429,
+                            -2.43180683,
+                            1.21660805,
+                            -2.60185516,
+                            1.05938698,
+                            0.96355525,
+                        ],
+                        [
+                            -1.25661354,
+                            -1.13195752,
+                            0.47894153,
+                            1.19304616,
+                            -0.69451204,
+                            2.2175799,
+                        ],
+                        [
+                            1.34278748,
+                            -1.52081064,
+                            -0.2507571,
+                            0.67087564,
+                            -0.79763021,
+                            -0.41767333,
+                        ],
+                        [
+                            -2.32956058,
+                            0.03233625,
+                            -1.47391582,
+                            0.70333218,
+                            -0.2506578,
+                            0.24757612,
+                        ],
+                        [
+                            0.22672213,
+                            -0.60840215,
+                            -1.55909351,
+                            -0.30993582,
+                            -0.25493395,
+                            -1.13345972,
+                        ],
+                        [
+                            -0.30647421,
+                            -0.48087784,
+                            -0.71393674,
+                            -1.36828179,
+                            1.10667612,
+                            -0.15967295,
+                        ],
+                    ],
+                    [
+                        [
+                            0.32983435,
+                            -0.91425562,
+                            -0.35299711,
+                            1.31247588,
+                            0.15367215,
+                            -1.98610838,
+                        ],
+                        [
+                            0.81303132,
+                            0.15115689,
+                            1.8122944,
+                            0.96024569,
+                            1.75029563,
+                            1.79526488,
+                        ],
+                        [
+                            -0.72335846,
+                            -0.25343156,
+                            0.68296792,
+                            0.12407177,
+                            0.2543815,
+                            -0.51771794,
+                        ],
+                        [
+                            -1.56714417,
+                            1.19790861,
+                            1.20180306,
+                            0.41645108,
+                            -0.4753875,
+                            0.43112448,
+                        ],
+                        [
+                            -0.72958873,
+                            1.07136698,
+                            0.99048707,
+                            -1.65848592,
+                            0.53776319,
+                            0.37002138,
+                        ],
+                        [
+                            1.45602655,
+                            0.05036957,
+                            0.53813642,
+                            -1.29038552,
+                            0.66232652,
+                            -0.00563294,
+                        ],
+                    ],
+                    [
+                        [
+                            1.82491436,
+                            -1.87574983,
+                            -0.27483037,
+                            -1.41977775,
+                            0.95369067,
+                            -0.19138531,
+                        ],
+                        [
+                            -1.25252398,
+                            1.33494634,
+                            -0.13758054,
+                            -0.33883371,
+                            1.80729216,
+                            1.29806594,
+                        ],
+                        [
+                            0.77033134,
+                            -1.30258535,
+                            -1.8302794,
+                            0.52123884,
+                            0.90620194,
+                            -0.67787233,
+                        ],
+                        [
+                            -0.29091427,
+                            -0.27677645,
+                            -0.18344966,
+                            -0.92565511,
+                            0.19842833,
+                            0.59580347,
+                        ],
+                        [
+                            -0.29520923,
+                            0.17046046,
+                            -0.80503485,
+                            0.89908856,
+                            0.69774822,
+                            0.29579325,
+                        ],
+                        [
+                            0.17788624,
+                            -0.34228185,
+                            -0.37028163,
+                            -1.18220291,
+                            1.77898418,
+                            -0.17662215,
+                        ],
+                    ],
+                    [
+                        [
+                            0.06161488,
+                            1.56969206,
+                            0.81895252,
+                            -0.82887789,
+                            0.9260089,
+                            -0.0988148,
+                        ],
+                        [
+                            0.21460429,
+                            -1.4755581,
+                            1.36994785,
+                            1.17893958,
+                            -1.01790093,
+                            0.08058205,
+                        ],
+                        [
+                            -0.78913355,
+                            -0.48296865,
+                            -1.08832194,
+                            -0.81984527,
+                            0.22901453,
+                            0.0114611,
+                        ],
+                        [
+                            -0.50999815,
+                            -0.52438008,
+                            -0.39893658,
+                            -0.68719077,
+                            1.0338822,
+                            0.14097484,
+                        ],
+                        [
+                            1.45503734,
+                            1.70649681,
+                            -0.53885203,
+                            -0.62992688,
+                            -0.3641152,
+                            -0.1234822,
+                        ],
+                        [
+                            -1.18950772,
+                            1.64488172,
+                            0.46651043,
+                            -2.17475965,
+                            0.36525702,
+                            0.9185165,
+                        ],
+                    ],
+                ]
+            ]
+        ]
+    )
+    (kernel_size, stride, padding) = ((1, 1, 1), (5, 5, 5), (0, 0, 0))
+    m_numpy = AvgPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.AvgPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = np.array(
+        [
+            [
+                [
+                    [
+                        [1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+                    ],
+                    [
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                    ],
+                    [
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                    ],
+                    [
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                    ],
+                    [
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                    ],
+                    [
+                        [1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPoolingModule(flow.unittest.TestCase):
+    def test_avgpool3d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_avgpool3d,
+            _test_avgpool3d_backward,
+            _test_avgpool3d_special_kernel_size_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_avgpool2d.py b/python/oneflow/test/modules/test_avgpool2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cf437f26614d5359aaae7f47a1e70ea4d0028f6
--- /dev/null
+++ b/python/oneflow/test/modules/test_avgpool2d.py
@@ -0,0 +1,430 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.nn.modules.utils import _pair, _reverse_repeat_tuple, _single, _triple
+
+g_samples = [
+    {
+        "kernel": (3, 2),
+        "padding": 0,
+        "stride": (2, 1),
+        "in": np.array(
+            [
+                [
+                    [
+                        [
+                            -0.1953,
+                            1.3992,
+                            -0.7464,
+                            0.691,
+                            -1.5484,
+                            0.497,
+                            1.4963,
+                            0.308,
+                            -1.473,
+                            -0.1238,
+                        ],
+                        [
+                            -0.3532,
+                            1.2078,
+                            -0.3796,
+                            0.7326,
+                            -1.5795,
+                            0.2128,
+                            0.6501,
+                            -0.1266,
+                            -1.3121,
+                            0.1483,
+                        ],
+                        [
+                            -0.3412,
+                            -1.6446,
+                            -1.0039,
+                            -0.5594,
+                            0.745,
+                            -0.5323,
+                            -1.6887,
+                            0.2399,
+                            1.9422,
+                            0.4214,
+                        ],
+                        [
+                            -1.6362,
+                            -1.2234,
+                            -1.2531,
+                            0.6109,
+                            0.2228,
+                            -0.208,
+                            0.6359,
+                            0.2451,
+                            0.3864,
+                            0.4263,
+                        ],
+                        [
+                            0.7053,
+                            0.3413,
+                            0.909,
+                            -0.4057,
+                            -0.283,
+                            1.0444,
+                            -0.2884,
+                            0.7638,
+                            -1.4793,
+                            0.2079,
+                        ],
+                        [
+                            -0.1207,
+                            0.8458,
+                            -0.9521,
+                            0.363,
+                            0.1772,
+                            0.3945,
+                            0.4056,
+                            -0.7822,
+                            0.6166,
+                            1.3343,
+                        ],
+                        [
+                            -0.4115,
+                            0.5802,
+                            1.2909,
+                            1.6508,
+                            -0.0561,
+                            -0.7964,
+                            0.9786,
+                            0.4265,
+                            0.7262,
+                            0.2819,
+                        ],
+                        [
+                            -0.2667,
+                            -0.0792,
+                            0.4771,
+                            0.3248,
+                            -0.1313,
+                            -0.3325,
+                            -0.9973,
+                            0.3128,
+                            -0.5151,
+                            -0.1225,
+                        ],
+                        [
+                            -1.4983,
+                            0.2604,
+                            -0.9127,
+                            0.0822,
+                            0.3708,
+                            -2.6024,
+                            0.2249,
+                            -0.75,
+                            0.3152,
+                            0.1931,
+                        ],
+                        [
+                            -0.2171,
+                            -0.2602,
+                            0.9051,
+                            -0.0933,
+                            -0.0902,
+                            -1.3837,
+                            -1.2519,
+                            -1.3091,
+                            0.7155,
+                            2.3376,
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        "out": np.array(
+            [
+                [
+                    [
+                        [
+                            0.0121,
+                            -0.1946,
+                            -0.211,
+                            -0.2531,
+                            -0.3675,
+                            0.1059,
+                            0.1465,
+                            -0.0703,
+                            -0.0662,
+                        ],
+                        [
+                            -0.6331,
+                            -0.6458,
+                            -0.2837,
+                            0.0551,
+                            0.1648,
+                            -0.1729,
+                            -0.0154,
+                            0.3497,
+                            0.3175,
+                        ],
+                        [
+                            0.3234,
+                            0.5025,
+                            0.476,
+                            0.241,
+                            0.0801,
+                            0.2897,
+                            0.2506,
+                            0.0453,
+                            0.2813,
+                        ],
+                        [
+                            -0.2359,
+                            0.2694,
+                            0.4855,
+                            0.3735,
+                            -0.5913,
+                            -0.5875,
+                            0.0326,
+                            0.0859,
+                            0.1465,
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        "ceil_mode": False,
+    },
+    {
+        "in": np.array(
+            [
+                [
+                    [
+                        [
+                            -0.25874418,
+                            -0.735277,
+                            0.7187668,
+                            0.4317905,
+                            -0.15865013,
+                            0.32455945,
+                            0.91029733,
+                            -0.42489085,
+                            0.13257249,
+                            -0.7680078,
+                        ],
+                        [
+                            -0.48924643,
+                            0.41322532,
+                            -0.24956563,
+                            0.39011025,
+                            1.1571697,
+                            1.1312183,
+                            1.3140937,
+                            -0.88671404,
+                            -0.73976123,
+                            0.09273718,
+                        ],
+                        [
+                            -1.5684161,
+                            0.94065344,
+                            0.39506504,
+                            -0.698693,
+                            -0.9967914,
+                            -2.0290415,
+                            0.98462844,
+                            0.7358801,
+                            1.1113276,
+                            0.6782418,
+                        ],
+                        [
+                            1.4970111,
+                            -0.10413595,
+                            1.4999448,
+                            1.3459393,
+                            -0.7604277,
+                            1.2852267,
+                            0.01842104,
+                            -1.2325357,
+                            0.44910756,
+                            -0.66622615,
+                        ],
+                        [
+                            2.0804522,
+                            -0.8352113,
+                            -0.63586867,
+                            0.16018416,
+                            -0.08155673,
+                            0.41048485,
+                            -1.2774752,
+                            -0.24625959,
+                            0.06801426,
+                            -0.36709896,
+                        ],
+                        [
+                            -2.2077172,
+                            0.72850853,
+                            0.48929325,
+                            0.7826485,
+                            1.3427622,
+                            -1.1062458,
+                            1.2447584,
+                            1.87407,
+                            1.0484176,
+                            -1.321674,
+                        ],
+                        [
+                            1.0160061,
+                            0.12091469,
+                            -0.80043447,
+                            1.3699176,
+                            0.83278096,
+                            -0.02582553,
+                            -0.08430449,
+                            -2.1967752,
+                            -0.02168749,
+                            -2.374834,
+                        ],
+                        [
+                            -0.6146652,
+                            -1.5595887,
+                            0.10382211,
+                            -0.43930522,
+                            0.11752917,
+                            -0.03595898,
+                            -1.216878,
+                            2.0072885,
+                            0.8048424,
+                            2.2326653,
+                        ],
+                        [
+                            0.02489181,
+                            0.01249131,
+                            0.5591928,
+                            0.20447306,
+                            1.4736984,
+                            0.76396596,
+                            -0.90115523,
+                            1.0401802,
+                            0.22212219,
+                            0.15565436,
+                        ],
+                        [
+                            -2.0538027,
+                            -1.0389869,
+                            -0.94865525,
+                            -0.6091378,
+                            -0.19524679,
+                            -0.50839746,
+                            0.4246262,
+                            0.0206702,
+                            0.62251437,
+                            -1.7211599,
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        "out": np.array(
+            [
+                [
+                    [
+                        [
+                            -0.28296748,
+                            0.24714465,
+                            0.164579,
+                            0.02082263,
+                            -0.09525593,
+                            0.43929258,
+                            0.43888247,
+                            -0.01193098,
+                            0.08451834,
+                        ],
+                        [
+                            0.3350589,
+                            0.21007456,
+                            0.34442863,
+                            -0.1718909,
+                            -0.36201763,
+                            -0.10129262,
+                            -0.16955681,
+                            0.14758904,
+                            0.2122277,
+                        ],
+                        [
+                            0.15049218,
+                            -0.15546632,
+                            0.2276234,
+                            0.7344561,
+                            0.22873335,
+                            -0.13976796,
+                            -0.11433101,
+                            0.08762991,
+                            -0.49481043,
+                        ],
+                        [
+                            -0.16665831,
+                            -0.2606004,
+                            0.16627766,
+                            0.5931824,
+                            0.5210317,
+                            -0.25002605,
+                            -0.22527404,
+                            0.30932844,
+                            0.16979378,
+                        ],
+                        [
+                            -0.76385164,
+                            -0.35398954,
+                            -0.19853179,
+                            0.2184467,
+                            0.38350502,
+                            -0.05524013,
+                            0.14608034,
+                            0.47637174,
+                            -0.18021727,
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        "kernel": (3, 2),
+        "stride": (2, 1),
+        "padding": 0,
+        "ceil_mode": True,
+    },
+]
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_AvgPool2d(test_case):
+        global g_samples
+        for sample in g_samples:
+            of_avgpool2d = flow.nn.AvgPool2d(
+                kernel_size=sample["kernel"],
+                padding=sample["padding"],
+                stride=sample["stride"],
+                ceil_mode=sample["ceil_mode"],
+            )
+            x = flow.Tensor(sample["in"])
+            of_y = of_avgpool2d(x)
+            test_case.assertTrue(of_y.numpy().shape == sample["out"].shape)
+            test_case.assertTrue(np.allclose(of_y.numpy(), sample["out"], 0.001, 0.001))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_batchnorm.py b/python/oneflow/test/modules/test_batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffe4c767ea53e4f414689fab15a355dfb6893ad3
--- /dev/null
+++ b/python/oneflow/test/modules/test_batchnorm.py
@@ -0,0 +1,521 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_batchnorm1d_2d_input(test_case, device):
+    input_arr = np.array(
+        [
+            [0.1438, 1.1229, -0.048, -1.6834, -0.8262],
+            [0.5836, 0.135, -0.886, -1.7878, 1.0592],
+            [0.7252, -1.1488, -0.0274, 1.4051, 0.1018],
+            [-0.3595, -0.1801, 0.1146, -1.5712, -1.9291],
+        ],
+        dtype=np.float32,
+    )
+    output_arr = np.array(
+        [
+            [-0.3056, 1.4066, 0.4151, -0.5783, -0.3864],
+            [0.7326, 0.1884, -1.71, -0.6563, 1.317],
+            [1.0668, -1.3949, 0.4674, 1.7292, 0.4521],
+            [-1.4938, -0.2002, 0.8275, -0.4945, -1.3827],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm1d(num_features=5, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, rtol=0.0001, atol=0.0001))
+
+
+def _test_batchnorm1d_3d_input(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [-0.1091, 2.0041, 0.885, -0.0412],
+                [-1.2055, 0.7442, 2.33, 1.2411],
+                [-1.2466, 0.3667, 1.2267, 0.3043],
+            ],
+            [
+                [-0.2484, -1.1407, 0.3352, 0.6687],
+                [-0.2975, -0.0227, -0.2302, -0.3762],
+                [-0.7759, -0.6789, 1.1444, 1.8077],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output_arr = np.array(
+        [
+            [
+                [-0.464, 1.9673, 0.6798, -0.3859],
+                [-1.4207, 0.4529, 1.9767, 0.9303],
+                [-1.4831, 0.096, 0.9379, 0.035],
+            ],
+            [
+                [-0.6243, -1.651, 0.0471, 0.4309],
+                [-0.5481, -0.284, -0.4834, -0.6237],
+                [-1.0224, -0.9274, 0.8573, 1.5066],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm1d(num_features=3, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, rtol=0.0001, atol=0.0001))
+
+
+def _test_batchnorm2d(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output = np.array(
+        [
+            [
+                [
+                    [-1.1868, -0.0328, 0.4606, -0.5833],
+                    [0.522, -2.0933, -1.2709, -0.119],
+                    [-0.0034, 1.5209, 0.0498, -1.1598],
+                ],
+                [
+                    [0.5601, -0.3231, 0.5505, -0.9595],
+                    [1.3404, -0.4424, 0.8233, -2.6035],
+                    [0.2673, 0.5504, 0.1273, -0.0482],
+                ],
+            ],
+            [
+                [
+                    [1.6299, 1.6085, -0.0996, 0.7062],
+                    [-0.3608, 1.2914, 0.8723, -0.2837],
+                    [-1.2557, -0.3051, 1.0531, -0.9606],
+                ],
+                [
+                    [-1.1698, 1.1818, -1.4536, 0.7807],
+                    [0.89, 1.4763, 0.0223, -1.0139],
+                    [0.519, -0.7375, -1.2078, 0.87],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm2d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), dtype=flow.float32)
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 0.0001, 0.0001))
+
+
+def _test_batchnorm2d_track_running_stats(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output = np.array(
+        [
+            [
+                [
+                    [-1.1868, -0.0328, 0.4606, -0.5833],
+                    [0.522, -2.0933, -1.2709, -0.119],
+                    [-0.0034, 1.5209, 0.0498, -1.1598],
+                ],
+                [
+                    [0.5601, -0.3231, 0.5505, -0.9595],
+                    [1.3404, -0.4424, 0.8233, -2.6035],
+                    [0.2673, 0.5504, 0.1273, -0.0482],
+                ],
+            ],
+            [
+                [
+                    [1.6299, 1.6085, -0.0996, 0.7062],
+                    [-0.3608, 1.2914, 0.8723, -0.2837],
+                    [-1.2557, -0.3051, 1.0531, -0.9606],
+                ],
+                [
+                    [-1.1698, 1.1818, -1.4536, 0.7807],
+                    [0.89, 1.4763, 0.0223, -1.0139],
+                    [0.519, -0.7375, -1.2078, 0.87],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm2d(
+        num_features=2, eps=1e-05, momentum=0.1, track_running_stats=False
+    ).to(device=flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 0.0001, 0.0001))
+
+
+def _test_batchnorm2d_4d_input(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output = np.array(
+        [
+            [
+                [
+                    [-1.1868, -0.0328, 0.4606, -0.5833],
+                    [0.522, -2.0933, -1.2709, -0.119],
+                    [-0.0034, 1.5209, 0.0498, -1.1598],
+                ],
+                [
+                    [0.5601, -0.3231, 0.5505, -0.9595],
+                    [1.3404, -0.4424, 0.8233, -2.6035],
+                    [0.2673, 0.5504, 0.1273, -0.0482],
+                ],
+            ],
+            [
+                [
+                    [1.6299, 1.6085, -0.0996, 0.7062],
+                    [-0.3608, 1.2914, 0.8723, -0.2837],
+                    [-1.2557, -0.3051, 1.0531, -0.9606],
+                ],
+                [
+                    [-1.1698, 1.1818, -1.4536, 0.7807],
+                    [0.89, 1.4763, 0.0223, -1.0139],
+                    [0.519, -0.7375, -1.2078, 0.87],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm2d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 0.0001, 0.0001))
+
+
+def test_batchnorm2d_infer(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output_arr = np.array(
+        [
+            [
+                [
+                    [-0.8790956, 0.2552987, 0.7402963, -0.28589857],
+                    [0.800596, -1.7700912, -0.9616952, 0.17049915],
+                    [0.28419858, 1.7824911, 0.3364983, -0.85249573],
+                ],
+                [
+                    [0.7331963, -0.07369963, 0.72449636, -0.6550967],
+                    [1.4460927, -0.18269908, 0.9736951, -2.1570892],
+                    [0.46569768, 0.72439635, 0.3377983, 0.1774991],
+                ],
+            ],
+            [
+                [
+                    [1.8895906, 1.8685907, 0.18959905, 0.9816951],
+                    [-0.06709967, 1.5568923, 1.1448942, 0.00859996],
+                    [-0.9467952, -0.01239994, 1.3226933, -0.65669674],
+                ],
+                [
+                    [-0.84719574, 1.3011935, -1.1064945, 0.9347953],
+                    [1.0345949, 1.5702921, 0.24189879, -0.7047965],
+                    [0.69569653, -0.45229775, -0.8818956, 1.0163949],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm2d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    m.eval()
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, 0.0001, 0.0001))
+
+
+def test_batchnorm2d_infer_4d_input(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output_arr = np.array(
+        [
+            [
+                [
+                    [-0.8790956, 0.2552987, 0.7402963, -0.28589857],
+                    [0.800596, -1.7700912, -0.9616952, 0.17049915],
+                    [0.28419858, 1.7824911, 0.3364983, -0.85249573],
+                ],
+                [
+                    [0.7331963, -0.07369963, 0.72449636, -0.6550967],
+                    [1.4460927, -0.18269908, 0.9736951, -2.1570892],
+                    [0.46569768, 0.72439635, 0.3377983, 0.1774991],
+                ],
+            ],
+            [
+                [
+                    [1.8895906, 1.8685907, 0.18959905, 0.9816951],
+                    [-0.06709967, 1.5568923, 1.1448942, 0.00859996],
+                    [-0.9467952, -0.01239994, 1.3226933, -0.65669674],
+                ],
+                [
+                    [-0.84719574, 1.3011935, -1.1064945, 0.9347953],
+                    [1.0345949, 1.5702921, 0.24189879, -0.7047965],
+                    [0.69569653, -0.45229775, -0.8818956, 1.0163949],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm2d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    m.eval()
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, 0.0001, 0.0001))
+
+
+def _test_batchnorm2d_backward(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.BatchNorm2d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.zeros(shape=input_arr.shape), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBatchNorm(flow.unittest.TestCase):
+    def test_batchnorm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_batchnorm2d,
+            _test_batchnorm1d_2d_input,
+            _test_batchnorm1d_3d_input,
+            _test_batchnorm2d_4d_input,
+            _test_batchnorm2d_track_running_stats,
+            test_batchnorm2d_infer,
+            test_batchnorm2d_infer_4d_input,
+            _test_batchnorm2d_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @unittest.skip("batchnorm module has a bug")
+    def test_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            for training in [True, False]:
+                test_module_against_pytorch(
+                    test_case,
+                    "nn.BatchNorm2d",
+                    extra_annotations={
+                        "num_features": int,
+                        "eps": float,
+                        "momentum": float,
+                        "affine": bool,
+                        "track_running_stats": bool,
+                        "dtype": str,
+                        "device": flow.device,
+                    },
+                    extra_generators={
+                        "input": random_tensor(ndim=4, dim1=8),
+                        "num_features": constant(8),
+                        "eps": random(1e-06, 1),
+                        "momentum": random(0, 1),
+                        "track_running_stats": constant(True),
+                    },
+                    device=device,
+                    training=training,
+                    n=10,
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_bce_loss.py b/python/oneflow/test/modules/test_bce_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6ea6bb2b37b7c730e6edaac792c35167f391ff2
--- /dev/null
+++ b/python/oneflow/test/modules/test_bce_loss.py
@@ -0,0 +1,150 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_bceloss(np_input, np_target, np_weight):
+    np_cross_entropy = -(
+        np_target * np.log(np_input) + (1 - np_target) * np.log(1 - np_input)
+    )
+    if np_weight is not None:
+        assert (
+            np_weight.shape == np_input.shape
+        ), "The weight shape must be the same as Input shape"
+        np_weighted_loss = np_weight * np_cross_entropy
+    else:
+        np_weighted_loss = np_cross_entropy
+    np_bce_loss = np_weighted_loss
+    np_bce_loss_sum = np.sum(np_weighted_loss)
+    np_bce_loss_mean = np.mean(np_weighted_loss)
+    return {"none": np_bce_loss, "sum": np_bce_loss_sum, "mean": np_bce_loss_mean}
+
+
+def _test_bceloss_impl(test_case, device, reduction):
+    x = np.array([[1.2, 0.2, -0.3], [0.7, 0.6, -2]]).astype(np.float32)
+    y = np.array([[0, 1, 0], [1, 0, 1]]).astype(np.float32)
+    w = np.array([[2, 2, 2], [2, 2, 2]]).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    weight = flow.Tensor(w, dtype=flow.float32, device=flow.device(device))
+    activation = flow.nn.Sigmoid()
+    sigmoid_input = activation(input)
+    loss = flow.nn.BCELoss(weight, reduction=reduction)
+    loss = loss.to(device)
+    of_out = loss(sigmoid_input, target)
+    np_out = _np_bceloss(sigmoid_input.numpy(), y, w)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    if reduction == "none":
+        np_grad = np.array(
+            [[1.5370497, -0.90033215, 0.851115], [-0.6636245, 1.2913125, -1.7615942]]
+        ).astype(np.float32)
+    elif reduction == "sum":
+        np_grad = np.array(
+            [[1.5370497, -0.90033215, 0.851115], [-0.6636245, 1.2913125, -1.7615942]]
+        ).astype(np.float32)
+    else:
+        np_grad = np.array(
+            [
+                [0.25617492, -0.15005533, 0.14185251],
+                [-0.11060409, 0.21521877, -0.29359904],
+            ]
+        ).astype(np.float32)
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+    input_none = input = flow.Tensor(
+        np.array([[1.2, 0.2, -0.3], [0.7, 0.6, -2]]).astype(np.float32),
+        dtype=flow.float32,
+        requires_grad=True,
+        device=flow.device(device),
+    )
+    sigmoid_input_none = activation(input_none)
+    loss_none = flow.nn.BCELoss(reduction=reduction)
+    loss_none = loss_none.to(device)
+    of_out_none = loss_none(sigmoid_input_none, target)
+    np_out_none = _np_bceloss(sigmoid_input.numpy(), y, None)[reduction]
+    test_case.assertTrue(np.allclose(of_out_none.numpy(), np_out_none, 1e-05, 1e-05))
+    of_out_none = of_out_none.sum()
+    of_out_none.backward()
+    if reduction == "none":
+        np_grad_none = np.array(
+            [[0.7685, -0.4502, 0.4256], [-0.3318, 0.6457, -0.8808]]
+        ).astype(np.float32)
+    elif reduction == "sum":
+        np_grad_none = np.array(
+            [[0.7685, -0.4502, 0.4256], [-0.3318, 0.6457, -0.8808]]
+        ).astype(np.float32)
+    else:
+        np_grad_none = np.array(
+            [[0.1281, -0.075, 0.0709], [-0.0553, 0.1076, -0.1468]]
+        ).astype(np.float32)
+    test_case.assertTrue(
+        np.allclose(input_none.grad.numpy(), np_grad_none, 0.0001, 0.0001)
+    )
+    input_none = input = flow.Tensor(
+        np.array([[1.2, 0.2, -0.3], [0.7, 0.6, -2]]).astype(np.float32),
+        dtype=flow.float32,
+        requires_grad=True,
+        device=flow.device(device),
+    )
+    sigmoid_input_none = activation(input_none)
+    loss_none = flow.nn.BCELoss(reduction=reduction)
+    loss_none = loss_none.to(device)
+    of_out_none = loss_none(sigmoid_input_none, target)
+    np_out_none = _np_bceloss(sigmoid_input.numpy(), y, None)[reduction]
+    test_case.assertTrue(np.allclose(of_out_none.numpy(), np_out_none, 1e-05, 1e-05))
+    of_out_none = of_out_none.sum()
+    of_out_none.backward()
+    if reduction == "none":
+        np_grad_none = np.array(
+            [[0.7685, -0.4502, 0.4256], [-0.3318, 0.6457, -0.8808]]
+        ).astype(np.float32)
+    elif reduction == "sum":
+        np_grad_none = np.array(
+            [[0.7685, -0.4502, 0.4256], [-0.3318, 0.6457, -0.8808]]
+        ).astype(np.float32)
+    else:
+        np_grad_none = np.array(
+            [[0.1281, -0.075, 0.0709], [-0.0553, 0.1076, -0.1468]]
+        ).astype(np.float32)
+    test_case.assertTrue(
+        np.allclose(input_none.grad.numpy(), np_grad_none, 0.0001, 0.0001)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBCELossModule(flow.unittest.TestCase):
+    def test_bceloss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_bceloss_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["reduction"] = ["none", "sum", "mean"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_bcewithlogitsloss.py b/python/oneflow/test/modules/test_bcewithlogitsloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b01ed68818f14ef8998373dc799cdbfb99b4ab6
--- /dev/null
+++ b/python/oneflow/test/modules/test_bcewithlogitsloss.py
@@ -0,0 +1,116 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_bcewithlogitsloss(
+    np_input, np_target, np_weight=None, np_pos_weight=None, reduction="none"
+):
+    _neg_input = np.negative(np_input)
+    _max_val = np.clip(_neg_input, 0, None)
+    _neg_max_val = np.negative(_max_val)
+    if np_pos_weight is not None:
+        _log_weight = (np_pos_weight - 1) * np_target + 1
+        _loss = (1 - np_target) * np_input + _log_weight * (
+            np.log(np.exp(_neg_max_val) + np.exp(_neg_input - _max_val)) + _max_val
+        )
+    else:
+        _loss = (1 - np_target) * np_input + _max_val
+        _loss += np.log(np.exp(_neg_max_val) + np.exp(_neg_input - _max_val))
+    if np_weight is not None:
+        assert (
+            np_weight.shape == np_input.shape
+        ), "The weight shape must be the same as Input shape"
+        _weighted_loss = np_weight * _loss
+    else:
+        _weighted_loss = _loss
+    if reduction == "mean":
+        return _weighted_loss.mean()
+    elif reduction == "sum":
+        return _weighted_loss.sum()
+    else:
+        return _weighted_loss
+
+
+def _np_bcewithlogitsloss_grad(np_input, np_target, np_weight, np_pos_weight):
+    elemcnt = np_target.size
+    np_bce_with_logits_grad_mean = -(np_weight / elemcnt) * (
+        np_target
+        - 1
+        + ((1 - np_pos_weight) * np_target - 1)
+        * (-np.exp(-np_input) / (1 + np.exp(-np_input)))
+    )
+    np_bce_with_logits_grad_sum = np_bce_with_logits_grad_mean * elemcnt
+    return {
+        "mean": np_bce_with_logits_grad_mean,
+        "sum": np_bce_with_logits_grad_sum,
+        "none": np_bce_with_logits_grad_sum,
+    }
+
+
+def _test_bcewithlogitsloss_impl(test_case, device, shape, reduction):
+    x = np.random.randn(*shape).astype(np.float32)
+    y = np.random.randint(0, 2, [*shape]).astype(np.float32)
+    w = np.random.randn(*shape).astype(np.float32)
+    pw = np.random.randn([*shape][-1]).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    weight = flow.Tensor(w, dtype=flow.float32, device=flow.device(device))
+    pos_weight = flow.Tensor(pw, dtype=flow.float32, device=flow.device(device))
+    bcewithlogits_loss = flow.nn.BCEWithLogitsLoss(
+        weight=weight, pos_weight=pos_weight, reduction=reduction
+    )
+    of_out = bcewithlogits_loss(input, target)
+    np_out = _np_bcewithlogitsloss(
+        x, y, np_weight=w, np_pos_weight=pw, reduction=reduction
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_bcewithlogitsloss_grad(x, y, np_weight=w, np_pos_weight=pw)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBCEWithLogitsLossModule(flow.unittest.TestCase):
+    def test_bcewithlogitsloss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_bcewithlogitsloss_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [
+            (3, 5),
+            (10, 9, 21),
+            (14, 22, 9, 21),
+            (3, 2, 4, 16, 5),
+            (1,),
+        ]
+        arg_dict["reduction"] = ["none", "sum", "mean"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_bernoulli.py b/python/oneflow/test/modules/test_bernoulli.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aecd6d6ce2f6613997982d058a2ba0250d49303
--- /dev/null
+++ b/python/oneflow/test/modules/test_bernoulli.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_bernoulli(test_case, shape):
+    input_arr = np.ones(shape)
+    x = flow.Tensor(input_arr, device=flow.device("cpu"))
+    y = flow.bernoulli(x)
+    test_case.assertTrue(np.allclose(y.numpy(), x.numpy()))
+
+
+def _test_bernoulli_with_generator(test_case, shape):
+    generator = flow.Generator()
+    generator.manual_seed(0)
+    x = flow.Tensor(np.random.rand(*shape), device=flow.device("cpu"))
+    y_1 = flow.bernoulli(x, generator=generator)
+    y_1.numpy()
+    generator.manual_seed(0)
+    y_2 = flow.bernoulli(x, generator=generator)
+    test_case.assertTrue(np.allclose(y_1.numpy(), y_2.numpy()))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBernoulli(flow.unittest.TestCase):
+    def test_bernoulli(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_functions"] = [_test_bernoulli]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_bmm.py b/python/oneflow/test/modules/test_bmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e18748de281d0173ac2e35a4127b2d3059246277
--- /dev/null
+++ b/python/oneflow/test/modules/test_bmm.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_bmm(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(10, 3, 4), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(10, 4, 5), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.bmm(input1, input2)
+    np_out = np.matmul(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_bmm_backward(test_case, device):
+    input1 = flow.Tensor(
+        [
+            [
+                [-0.0036776792258024216, 1.9946473836898804, -0.423959881067276],
+                [1.0892143249511719, 0.04005361348390579, -0.27883127331733704],
+            ],
+            [
+                [-0.970306396484375, 0.017771577462553978, 0.019596196711063385],
+                [0.27402883768081665, -0.8192587494850159, -0.3135920464992523],
+            ],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        [
+            [
+                [1.118346929550171, -0.930071234703064],
+                [1.1238232851028442, 1.373764157295227],
+                [0.17178462445735931, -1.1010534763336182],
+            ],
+            [
+                [0.6694859862327576, 0.9250285029411316],
+                [-1.0835869312286377, 0.4192655086517334],
+                [1.2616937160491943, 0.33809131383895874],
+            ],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.bmm(input1, input2)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            [0.18827569484710693, 2.4975874423980713, -0.9292688369750977],
+            [0.18827569484710693, 2.4975874423980713, -0.9292688369750977],
+        ],
+        [
+            [1.5945144891738892, -0.6643214225769043, 1.5997850894927979],
+            [1.5945144891738892, -0.6643214225769043, 1.5997850894927979],
+        ],
+    ]
+    test_case.assertTrue(
+        np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_bmm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_bmm, _test_bmm_backward]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_broadcast_like.py b/python/oneflow/test/modules/test_broadcast_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0e988f80ff0febad6f1e24485758f3ff91f70fa
--- /dev/null
+++ b/python/oneflow/test/modules/test_broadcast_like.py
@@ -0,0 +1,111 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_broadcast_like(test_case, device):
+    input = flow.Tensor(
+        np.ones(shape=(3, 1, 1), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    like_tensor = flow.Tensor(
+        np.ones(shape=(3, 3, 3), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = flow.broadcast_like(input, like_tensor, broadcast_axes=(1, 2))
+    np_out = np.ones(shape=(3, 3, 3))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_broadcast_like_3dim(test_case, device):
+    input = flow.Tensor(
+        np.ones(shape=(1, 3, 2), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    like_tensor = flow.Tensor(
+        np.ones(shape=(3, 3, 2), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = flow.broadcast_like(input, like_tensor, broadcast_axes=(0,))
+    np_out = np.ones(shape=(3, 3, 2))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_broadcast_like_4dim(test_case, device):
+    input = flow.Tensor(
+        np.ones(shape=(1, 3, 2, 1), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    like_tensor = flow.Tensor(
+        np.ones(shape=(3, 3, 2, 3), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = flow.broadcast_like(input, like_tensor, broadcast_axes=(0, 3))
+    np_out = np.ones(shape=(3, 3, 2, 3))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_broadcast_like_backward(test_case, device):
+    input = flow.Tensor(
+        np.ones(shape=(3, 1, 1), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    like_tensor = flow.Tensor(
+        np.ones(shape=(3, 3, 3), dtype=np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.broadcast_like(input, like_tensor, broadcast_axes=(1, 2))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[9.0]], [[9.0]], [[9.0]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestBroadCastLike(flow.unittest.TestCase):
+    def test_broadcast_like(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_broadcast_like,
+            _test_broadcast_like_3dim,
+            _test_broadcast_like_4dim,
+            _test_broadcast_like_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_cast.py b/python/oneflow/test/modules/test_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d21a21429e3cf663cc4448355c4d312e4895272
--- /dev/null
+++ b/python/oneflow/test/modules/test_cast.py
@@ -0,0 +1,71 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_cast_float2int(test_case, device, shape):
+    np_arr = np.random.randn(*shape).astype(np.float32)
+    input = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    output = flow.cast(input, flow.int8)
+    np_out = np_arr.astype(np.int8)
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+
+
+def _test_cast_int2float(test_case, device, shape):
+    np_arr = np.random.randn(*shape).astype(np.int8)
+    input = flow.Tensor(np_arr, dtype=flow.int8, device=flow.device(device))
+    output = flow.cast(input, flow.float32)
+    np_out = np_arr.astype(np.float32)
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+
+
+def _test_cast_backward(test_case, device, shape):
+    np_arr = np.random.randn(*shape).astype(np.float32)
+    x = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = flow.cast(x, flow.int8)
+    z = y.sum()
+    z.backward()
+    np_out = np_arr.astype(np.int8)
+    test_case.assertTrue(np.array_equal(x.grad.numpy(), np.ones(shape=shape)))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCast(flow.unittest.TestCase):
+    def test_cast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_cast_float2int,
+            _test_cast_int2float,
+            _test_cast_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_ceil.py b/python/oneflow/test/modules/test_ceil.py
new file mode 100644
index 0000000000000000000000000000000000000000..6eb12d7a32be4b65546b8bd0a3e2c472cbacdb5f
--- /dev/null
+++ b/python/oneflow/test/modules/test_ceil.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_ceil_impl(test_case, device, shape):
+    x = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.ceil(x)
+    np_out = np.ceil(x.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np.zeros(shape), 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCeilModule(flow.unittest.TestCase):
+    def test_ceil(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_ceil_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(1,), (2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_chunk.py b/python/oneflow/test/modules/test_chunk.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac15254d55b2b5a458a5619646411028c57eb401
--- /dev/null
+++ b/python/oneflow/test/modules/test_chunk.py
@@ -0,0 +1,188 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_2_dim_forward(test_case, device):
+    np_arr = np.random.randn(2, 3).astype(np.float32)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    dim = 0
+    chunks = 2
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(1, 3), (1, 3)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 1
+    chunks = 2
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(2, 1), (2, 2)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 1
+    chunks = 3
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(2, 1), (2, 1), (2, 1)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+
+
+def _test_2_dim_tensor_function_forward(test_case, device):
+    np_arr = np.random.randn(2, 3).astype(np.float32)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    dim = 0
+    chunks = 2
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(1, 3), (1, 3)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 1
+    chunks = 2
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(2, 1), (2, 2)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 1
+    chunks = 3
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(2, 1), (2, 1), (2, 1)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+
+
+def _test_4_dim_forward(test_case, device):
+    np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    dim = 2
+    chunks = 3
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(5, 3, 2, 9), (5, 3, 2, 9), (5, 3, 2, 9)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 2
+    chunks = 4
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(5, 3, 1, 9), (5, 3, 1, 9), (5, 3, 1, 9), (5, 3, 3, 9)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 3
+    chunks = 3
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(5, 3, 6, 3), (5, 3, 6, 3), (5, 3, 6, 3)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 3
+    chunks = 2
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(5, 3, 6, 4), (5, 3, 6, 5)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 3
+    chunks = 4
+    of_out = flow.chunk(input, chunks, dim)
+    np_out_shape = [(5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 3)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+
+
+def _test_4_dim_tensor_function_forward(test_case, device):
+    np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    dim = 2
+    chunks = 3
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(5, 3, 2, 9), (5, 3, 2, 9), (5, 3, 2, 9)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 2
+    chunks = 4
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(5, 3, 1, 9), (5, 3, 1, 9), (5, 3, 1, 9), (5, 3, 3, 9)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 3
+    chunks = 3
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(5, 3, 6, 3), (5, 3, 6, 3), (5, 3, 6, 3)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 3
+    chunks = 2
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(5, 3, 6, 4), (5, 3, 6, 5)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+    dim = 3
+    chunks = 4
+    of_out = input.chunk(chunks, dim)
+    np_out_shape = [(5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 2), (5, 3, 6, 3)]
+    for i in range(0, chunks):
+        of_out_shape = of_out[i].numpy().shape
+        test_case.assertTrue(np.allclose(of_out_shape, np_out_shape[i], 1e-05, 1e-05))
+
+
+def _test_chunk_backward(test_case, device):
+    np_arr = np.random.randn(2, 3).astype(np.float32)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    input.requires_grad = True
+    y = flow.chunk(input, chunks=2, dim=0)
+    (z1, z2) = (y[0].sum(), y[1].sum())
+    z1.backward()
+    z2.backward()
+    np_grad = np.ones((2, 3))
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestChunk(flow.unittest.TestCase):
+    def test_chunk(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_2_dim_forward,
+            _test_4_dim_forward,
+            _test_2_dim_tensor_function_forward,
+            _test_4_dim_tensor_function_forward,
+            _test_chunk_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_clamp.py b/python/oneflow/test/modules/test_clamp.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ef548bfd688db283eb8d4856a8a5ab201691a6f
--- /dev/null
+++ b/python/oneflow/test/modules/test_clamp.py
@@ -0,0 +1,111 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_clamp(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.clamp(input, 0.1, 0.5)
+    np_out = np.clip(input.numpy(), 0.1, 0.5)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_tensor_clamp(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = input.clamp(0.1, 0.5)
+    np_out = np.clip(input.numpy(), 0.1, 0.5)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_clamp_scalar_min(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.clamp(input, 0.1, None)
+    np_out = np.clip(input.numpy(), 0.1, None)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_clamp_scalar_max(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.clamp(input, None, 0.5)
+    np_out = np.clip(input.numpy(), None, 0.5)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_clamp_integral(test_case, shape, device):
+    input = flow.Tensor(np.random.randint(3, 10, shape), device=flow.device(device))
+    of_out = flow.clamp(input, 1, 5)
+    np_out = np.clip(input.numpy(), 1, 5)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _numpy_clamp_grad(arr, min, max):
+    grad = np.zeros_like(arr)
+    grad[arr.clip(min, max) == arr] += 1
+    return grad
+
+
+def _test_clamp_backward(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.clamp(x, 0.1, 0.5).sum()
+    y.backward()
+    test_case.assertTrue(
+        np.allclose(
+            x.grad.numpy(), _numpy_clamp_grad(x.numpy(), 0.1, 0.5), 1e-05, 1e-05
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestClampModule(flow.unittest.TestCase):
+    def test_clamp(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [
+            _test_clamp,
+            _test_tensor_clamp,
+            _test_clamp_scalar_min,
+            _test_clamp_scalar_max,
+            _test_clamp_integral,
+            _test_clamp_backward,
+        ]
+        arg_dict["shape"] = [(2,), (2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_concat.py b/python/oneflow/test/modules/test_concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5dea24b23a5fb769a20758f877f7a8cf27b19f2
--- /dev/null
+++ b/python/oneflow/test/modules/test_concat.py
@@ -0,0 +1,137 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_concat_origin(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.cat([input1, input2], dim=0)
+    np_out = np.concatenate((input1.numpy(), input2.numpy()), axis=0)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_concat_with_axis_one(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.cat([input1, input2], dim=1)
+    np_out = np.concatenate((input1.numpy(), input2.numpy()), axis=1)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_concat_with_three_tensor(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    input3 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.cat([input1, input2, input3], dim=1)
+    np_out = np.concatenate((input1.numpy(), input2.numpy(), input3.numpy()), axis=1)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_concat_with_three_tensor_backward(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input3 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.cat([input1, input2, input3], dim=1)
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(input1.grad.numpy(), np.ones((2, 6, 5, 3)), 0.0001, 0.0001)
+    )
+    test_case.assertTrue(
+        np.allclose(input2.grad.numpy(), np.ones((2, 6, 5, 3)), 0.0001, 0.0001)
+    )
+    test_case.assertTrue(
+        np.allclose(input3.grad.numpy(), np.ones((2, 6, 5, 3)), 0.0001, 0.0001)
+    )
+
+
+def _test_concat_grad_and_no_grad(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=False,
+    )
+    of_out = flow.cat([input1, input2], dim=1)
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(input1.grad.numpy(), np.ones((2, 6, 5, 3)), 0.0001, 0.0001)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_concat(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_concat_origin,
+            _test_concat_with_axis_one,
+            _test_concat_with_three_tensor,
+            _test_concat_with_three_tensor_backward,
+            _test_concat_grad_and_no_grad,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_constant.py b/python/oneflow/test/modules/test_constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..deaeb060668df00a43e69bfb1c623dff7bd239b0
--- /dev/null
+++ b/python/oneflow/test/modules/test_constant.py
@@ -0,0 +1,126 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.framework.tensor import register_tensor_op
+
+
+def _test_ones(test_case, device, shape):
+    y = flow.ones(shape, device=flow.device(device))
+    test_case.assertTrue(np.array_equal(np.ones(shape), y.numpy()))
+
+
+def _test_different_dtype(test_case, device, shape):
+    y1 = flow.ones(shape, dtype=flow.int32, device=flow.device(device))
+    test_case.assertTrue(np.array_equal(np.ones(shape, dtype=np.int32), y1.numpy()))
+    y2 = flow.ones(shape, dtype=flow.uint8, device=flow.device(device))
+    test_case.assertTrue(np.array_equal(np.ones(shape, dtype=np.uint8), y2.numpy()))
+    y3 = flow.ones(shape, dtype=flow.float64, device=flow.device(device))
+    test_case.assertTrue(np.array_equal(np.ones(shape, dtype=np.float64), y3.numpy()))
+
+
+def _test_ones_backward(test_case, device, shape):
+    x = flow.ones(shape, device=flow.device(device), requires_grad=True)
+    y = x.sum()
+    y.backward()
+    test_case.assertTrue(np.array_equal(np.ones(shape), x.grad.numpy()))
+
+
+def _test_zeros(test_case, device, shape):
+    y = flow.zeros(shape, device=flow.device(device))
+    test_case.assertTrue(np.array_equal(np.zeros(shape), y.numpy()))
+    y2 = flow.zeros(10, device=flow.device(device))
+    test_case.assertTrue(np.array_equal(np.zeros(10), y2.numpy()))
+    y3 = flow.zeros(10, dtype=flow.int, device=flow.device(device))
+    test_case.assertTrue(np.array_equal(np.zeros(10, dtype=int), y3.numpy()))
+
+
+def _test_zeros_backward(test_case, device, shape):
+    x = flow.zeros(shape, device=flow.device(device), requires_grad=True)
+    y = x.sum()
+    y.backward()
+    test_case.assertTrue(np.array_equal(np.ones(shape), x.grad.numpy()))
+
+
+def _test_ones_like(test_case, device, shape):
+    x = flow.Tensor(np.ones(shape, dtype=np.float64))
+    test_case.assertTrue(
+        np.array_equal(np.ones_like(x.numpy()), flow.ones_like(x).numpy())
+    )
+    x2 = flow.Tensor(np.ones([2, 4], dtype=int))
+    test_case.assertTrue(
+        np.array_equal(np.ones_like(x2.numpy()), flow.ones_like(x2).numpy())
+    )
+
+
+def _test_zeros_like(test_case, device, shape):
+    x = flow.Tensor(np.ones(shape, dtype=np.float64))
+    test_case.assertTrue(
+        np.array_equal(np.zeros_like(x.numpy()), flow.zeros_like(x).numpy())
+    )
+    x2 = flow.Tensor(np.ones(shape, dtype=int))
+    test_case.assertTrue(
+        np.array_equal(np.zeros_like(x2.numpy()), flow.zeros_like(x2).numpy())
+    )
+
+
+def _test_new_ones(test_case, device, shape):
+    x = flow.Tensor(np.ones(shape), device=flow.device(device))
+    y = x.new_ones(shape, device=device)
+    test_case.assertTrue(x.dtype == y.dtype)
+    test_case.assertTrue(x.device == y.device)
+    test_case.assertTrue(x.requires_grad == y.requires_grad)
+    x = flow.Tensor(np.ones(shape), device=flow.device(device))
+    y = x.new_ones(x.shape, device=device)
+    test_case.assertTrue(x.dtype == y.dtype)
+    test_case.assertTrue(x.device == y.device)
+    test_case.assertTrue(x.requires_grad == y.requires_grad)
+    x = flow.Tensor(np.ones(shape), device=flow.device(device))
+    x = x.new_ones(shape, device=device, requires_grad=True)
+    y = x.sum()
+    y.backward()
+    test_case.assertTrue(np.array_equal(np.ones_like(x.numpy()), x.grad.numpy()))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConstantModule(flow.unittest.TestCase):
+    def test_cast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_ones,
+            _test_different_dtype,
+            _test_zeros,
+            _test_ones_backward,
+            _test_zeros_backward,
+            _test_ones_like,
+            _test_zeros_like,
+            _test_new_ones,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_constantpad.py b/python/oneflow/test/modules/test_constantpad.py
new file mode 100644
index 0000000000000000000000000000000000000000..a658ec0f03dccdafe181dd81c18a265a52d89875
--- /dev/null
+++ b/python/oneflow/test/modules/test_constantpad.py
@@ -0,0 +1,155 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import Array2Numpy, FlattenArray, GenArgList, Index2Coordinate
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_constant_pad2d_grad(src, dest, padding):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    (dx_height, dx_width) = (dest.shape[h_idx], dest.shape[w_idx])
+    (dy_height, dy_width) = (src.shape[h_idx], src.shape[w_idx])
+    numpy_src = np.ones(src.shape, np.int32)
+    numpy_dest = np.zeros(dest.shape, np.int32)
+    array_src = FlattenArray(numpy_src)
+    array_dest = FlattenArray(numpy_dest)
+    src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+    dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+    elements_num = src.shape[0] * src_num
+    for iter_n in range(elements_num):
+        coords = Index2Coordinate(iter_n, src.shape)
+        (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+        ip_x = ip_y = 0
+        if (
+            j >= pad_left
+            and j < dx_width + pad_left
+            and (i >= pad_top)
+            and (i < dx_height + pad_top)
+        ):
+            ip_x = j - pad_left
+            ip_y = i - pad_top
+            src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+            dest_index = (
+                n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+            )
+            array_dest[dest_index] += array_src[src_index]
+    numpy_dest = Array2Numpy(array_dest, dest.shape)
+    return numpy_dest
+
+
+def _test_ConstantPad2d(test_case, shape, padding, value, device):
+    np_input = np.random.random(shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    if isinstance(padding, int):
+        np_boundary = ((0, 0), (0, 0), (padding, padding), (padding, padding))
+    elif isinstance(padding, (tuple, int)) and len(padding) == 4:
+        np_boundary = (
+            (0, 0),
+            (0, 0),
+            (padding[2], padding[3]),
+            (padding[0], padding[1]),
+        )
+    else:
+        raise ValueError("padding must be in or list or tuple!")
+    layer = flow.nn.ConstantPad2d(padding=padding, value=value)
+    of_out = layer(of_input)
+    np_out = np.pad(np_input, np_boundary, mode="constant", constant_values=value)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = _np_constant_pad2d_grad(np_out, np_input, layer.padding)
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_out_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConstantPad1d(flow.unittest.TestCase):
+    @autotest(rtol=0.0001, atol=0.0001)
+    def test_constantpad1d_with_random_data(test_case):
+        m = torch.nn.ConstantPad1d(padding=random().to(int), value=random().to(float))
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor(ndim=3, dim1=random(1, 6), dim2=random(1, 6)).to(
+            device
+        )
+        y = m(x)
+        return y
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConstantPad2d(flow.unittest.TestCase):
+    def test_ConstantPad2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(1, 2, 3, 4), (8, 3, 4, 4)]
+        arg_dict["padding"] = [2, (1, 1, 2, 2)]
+        arg_dict["value"] = [0.8, 1]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_ConstantPad2d(test_case, *arg)
+
+    def test_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            spatial_size = np.random.randint(1, 6)
+            test_module_against_pytorch(
+                test_case,
+                "nn.ConstantPad2d",
+                extra_annotations={"padding": int, "value": float},
+                extra_generators={
+                    "input": random_tensor(
+                        ndim=4, dim2=spatial_size, dim3=spatial_size
+                    ),
+                    "padding": random(0, 3),
+                    "value": random(0, 10),
+                },
+                device=device,
+            )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConstantPad3d(flow.unittest.TestCase):
+    def test_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            spatial_size = np.random.randint(1, 6)
+            test_module_against_pytorch(
+                test_case,
+                "nn.ConstantPad3d",
+                extra_annotations={"padding": int, "value": float},
+                extra_generators={
+                    "input": random_tensor(
+                        ndim=5, dim2=spatial_size, dim3=spatial_size, dim4=spatial_size
+                    ),
+                    "padding": random(0, 3),
+                    "value": random(0, 10),
+                },
+                device=device,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_conv.py b/python/oneflow/test/modules/test_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed0e15ad9994c5f475eb3206263d0b1aaaf3c38e
--- /dev/null
+++ b/python/oneflow/test/modules/test_conv.py
@@ -0,0 +1,1820 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+test_conv2d_weight = np.array(
+    [
+        [
+            [
+                [0.8586049675941467, -0.2279418259859085, 0.2013147622346878],
+                [0.35005471110343933, 0.5360521078109741, 1.5194443464279175],
+                [1.9040879011154175, -1.5734431743621826, -0.14007866382598877],
+            ]
+        ],
+        [
+            [
+                [0.29670074582099915, 1.3111951351165771, 0.5035904049873352],
+                [-1.1894450187683105, -0.5502137541770935, -1.591875672340393],
+                [-1.1081947088241577, 0.07872020453214645, -0.9185634255409241],
+            ]
+        ],
+        [
+            [
+                [-0.7457143664360046, -1.2080862522125244, 1.8140212297439575],
+                [-1.5227429866790771, -2.515244960784912, -1.3549325466156006],
+                [-0.9574840068817139, -0.7248556613922119, 1.1119636297225952],
+            ]
+        ],
+    ]
+)
+test_conv2d_data = np.array(
+    [
+        [
+            [
+                [
+                    1.1630785465240479,
+                    0.4838046133518219,
+                    0.299563467502594,
+                    0.15302546322345734,
+                    -1.168814778327942,
+                ],
+                [
+                    1.5580710172653198,
+                    -0.5459445714950562,
+                    -2.3556296825408936,
+                    0.5414402484893799,
+                    2.678506374359131,
+                ],
+                [
+                    1.2546343803405762,
+                    -0.5487740635871887,
+                    -0.6810643672943115,
+                    -0.13531559705734253,
+                    0.37723132967948914,
+                ],
+                [
+                    0.41016456484794617,
+                    0.5712682008743286,
+                    -2.757962703704834,
+                    1.0762799978256226,
+                    -0.6141325235366821,
+                ],
+                [
+                    1.830764889717102,
+                    -1.1468064785003662,
+                    0.053837940096855164,
+                    -2.5074806213378906,
+                    -0.5916498899459839,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_data_grad = np.array(
+    [
+        [
+            [
+                [
+                    0.4095913469791412,
+                    0.2847584038972855,
+                    2.803684800863266,
+                    2.3940934538841248,
+                    2.5189263969659805,
+                ],
+                [
+                    -1.9525419473648071,
+                    -4.606781497597694,
+                    -3.51521897315979,
+                    -1.562677025794983,
+                    1.0915625244379044,
+                ],
+                [
+                    -2.1141327619552612,
+                    -6.987950943410397,
+                    -5.84306687861681,
+                    -3.7289341166615486,
+                    1.1448840647935867,
+                ],
+                [
+                    -2.5237241089344025,
+                    -7.272709347307682,
+                    -8.646751679480076,
+                    -6.123027570545673,
+                    -1.3740423321723938,
+                ],
+                [
+                    -0.1615908145904541,
+                    -2.381169445812702,
+                    -2.32784790545702,
+                    -2.1662570908665657,
+                    0.0533215403556824,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_weight_grad = np.array(
+    [
+        [
+            [
+                [0.6277393400669098, -2.7888944894075394, -0.2910575419664383],
+                [-3.095237225294113, -4.835702538490295, -1.8706469237804413],
+                [-1.0139376372098923, -6.076017692685127, -5.780256435275078],
+            ]
+        ],
+        [
+            [
+                [0.6277393400669098, -2.7888944894075394, -0.2910575419664383],
+                [-3.095237225294113, -4.835702538490295, -1.8706469237804413],
+                [-1.0139376372098923, -6.076017692685127, -5.780256435275078],
+            ]
+        ],
+        [
+            [
+                [0.6277393400669098, -2.7888944894075394, -0.2910575419664383],
+                [-3.095237225294113, -4.835702538490295, -1.8706469237804413],
+                [-1.0139376372098923, -6.076017692685127, -5.780256435275078],
+            ]
+        ],
+    ]
+)
+test_conv2d_output = np.array(
+    [
+        [
+            [
+                [0.9699610471725464, -0.20758534967899323, 2.3857712745666504],
+                [0.3666309118270874, 4.690882682800293, -8.203354835510254],
+                [2.6072847843170166, -1.9033538103103638, 2.331153154373169],
+            ],
+            [
+                [2.519343852996826, 2.3757898807525635, -1.6613528728485107],
+                [0.5777544379234314, -3.5739502906799316, 5.349126815795898],
+                [0.729295015335083, 1.5791023969650269, 3.7627718448638916],
+            ],
+            [
+                [-0.27685487270355225, 6.446267127990723, -2.762883424758911],
+                [-8.25644588470459, 9.616064071655273, 8.005367279052734],
+                [-0.6944921016693115, 3.866114854812622, 4.788446426391602],
+            ],
+        ]
+    ]
+)
+test_conv2d_with_bias_weight = np.array(
+    [
+        [
+            [
+                [1.8271433115005493, -1.0446699857711792, 1.0062190294265747],
+                [0.5174201130867004, -0.806931734085083, 1.3769007921218872],
+                [0.205885112285614, 0.9943519234657288, -0.23580588400363922],
+            ]
+        ],
+        [
+            [
+                [0.29881811141967773, -1.9982075691223145, 0.3511354625225067],
+                [-0.7644741535186768, 1.2594351768493652, -0.9629734754562378],
+                [0.5080506205558777, 0.7561734318733215, 1.6839302778244019],
+            ]
+        ],
+        [
+            [
+                [1.2573646306991577, 0.13123232126235962, 1.6403018236160278],
+                [-1.2138012647628784, 2.399970531463623, -0.38509097695350647],
+                [-0.9878040552139282, 0.9585888385772705, -1.4976465702056885],
+            ]
+        ],
+    ]
+)
+test_conv2d_with_bias_bias = np.array(
+    [0.6605162620544434, -0.18903568387031555, -0.27302607893943787]
+)
+test_conv2d_with_bias_data = np.array(
+    [
+        [
+            [
+                [
+                    -0.47827261686325073,
+                    -1.1739492416381836,
+                    -0.7921845316886902,
+                    0.9321041703224182,
+                    -3.1557741165161133,
+                ],
+                [
+                    2.1935296058654785,
+                    -0.5385921001434326,
+                    -0.8611332774162292,
+                    -1.881519079208374,
+                    -0.7205708026885986,
+                ],
+                [
+                    -0.35601571202278137,
+                    -0.15963983535766602,
+                    1.797447681427002,
+                    0.19594945013523102,
+                    -1.7376397848129272,
+                ],
+                [
+                    0.047347065061330795,
+                    0.14580930769443512,
+                    0.32604914903640747,
+                    0.4578782916069031,
+                    -0.8942581415176392,
+                ],
+                [
+                    0.49383941292762756,
+                    -0.9043426513671875,
+                    -1.2140793800354004,
+                    2.1564064025878906,
+                    1.0938222408294678,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_with_bias_output = np.array(
+    [
+        [
+            [
+                [-0.05607491731643677, -0.185230553150177, -3.8808679580688477],
+                [6.861937046051025, -2.3341472148895264, -0.5597308874130249],
+                [1.8299254179000854, -2.770848274230957, 2.1958212852478027],
+            ],
+            [
+                [2.9348952770233154, 4.117504119873047, -6.278541088104248],
+                [0.2638452351093292, 3.998856782913208, 2.612290620803833],
+                [-1.9891828298568726, -1.6476304531097412, 3.39066219329834],
+            ],
+            [
+                [-8.44466781616211, 0.5747121572494507, -8.501373291015625],
+                [-0.036642804741859436, -0.23458999395370483, -2.370849370956421],
+                [2.8372013568878174, -2.987276077270508, 1.8382092714309692],
+            ],
+        ]
+    ]
+)
+test_conv2d_group_weight = np.array(
+    [
+        [
+            [
+                [-0.7248556613922119, 1.1119636297225952, -0.47827261686325073],
+                [-1.1739492416381836, -0.7921845316886902, 0.9321041703224182],
+                [-3.1557741165161133, 2.1935296058654785, -0.5385921001434326],
+            ]
+        ],
+        [
+            [
+                [-0.8611332774162292, -1.881519079208374, -0.7205708026885986],
+                [-0.35601571202278137, -0.15963983535766602, 1.797447681427002],
+                [0.19594945013523102, -1.7376397848129272, 0.047347065061330795],
+            ]
+        ],
+    ]
+)
+test_conv2d_group_data_grad = np.array(
+    [
+        [
+            [
+                [
+                    -0.7248556613922119,
+                    0.3871079683303833,
+                    -0.0911646485328674,
+                    0.6336910128593445,
+                    -0.4782726168632507,
+                ],
+                [
+                    -1.8988049030303955,
+                    -1.5790258049964905,
+                    -1.125194251537323,
+                    0.7736106514930725,
+                    0.4538315534591675,
+                ],
+                [
+                    -5.054579019546509,
+                    -2.5412703156471252,
+                    -2.6260308623313904,
+                    2.4285481572151184,
+                    -0.0847605466842651,
+                ],
+                [
+                    -4.329723358154297,
+                    -2.9283782839775085,
+                    -2.534866213798523,
+                    1.794857144355774,
+                    0.3935120701789856,
+                ],
+                [
+                    -3.1557741165161133,
+                    -0.9622445106506348,
+                    -1.5008366107940674,
+                    1.654937505722046,
+                    -0.5385921001434326,
+                ],
+            ],
+            [
+                [
+                    -0.8611332774162292,
+                    -2.7426523566246033,
+                    -3.463223159313202,
+                    -2.6020898818969727,
+                    -0.7205708026885986,
+                ],
+                [
+                    -1.2171489894390106,
+                    -3.2583079040050507,
+                    -2.1814310252666473,
+                    -0.9642820358276367,
+                    1.0768768787384033,
+                ],
+                [
+                    -1.0211995393037796,
+                    -4.799998238682747,
+                    -3.6757742948830128,
+                    -2.654574755579233,
+                    1.1242239437997341,
+                ],
+                [
+                    -0.1600662618875504,
+                    -2.0573458820581436,
+                    -0.2125511355698109,
+                    -0.0524848736822605,
+                    1.8447947464883327,
+                ],
+                [
+                    0.195949450135231,
+                    -1.5416903346776962,
+                    -1.4943432696163654,
+                    -1.6902927197515965,
+                    0.0473470650613308,
+                ],
+            ],
+        ]
+    ]
+)
+test_conv2d_group_weight_grad = np.array(
+    [
+        [
+            [
+                [0.6277393400669098, -2.7888944894075394, -0.2910575419664383],
+                [-3.095237225294113, -4.835702538490295, -1.8706469237804413],
+                [-1.0139376372098923, -6.076017692685127, -5.780256435275078],
+            ]
+        ],
+        [
+            [
+                [3.30740749835968, -0.7220746576786041, -3.660933956503868],
+                [0.5273916646838188, -2.631059892475605, -7.6207195818424225],
+                [-3.5466641262173653, -8.214546449482441, -11.031560003757477],
+            ]
+        ],
+    ]
+)
+test_conv2d_group_data = np.array(
+    [
+        [
+            [
+                [
+                    1.1630785465240479,
+                    0.4838046133518219,
+                    0.299563467502594,
+                    0.15302546322345734,
+                    -1.168814778327942,
+                ],
+                [
+                    1.5580710172653198,
+                    -0.5459445714950562,
+                    -2.3556296825408936,
+                    0.5414402484893799,
+                    2.678506374359131,
+                ],
+                [
+                    1.2546343803405762,
+                    -0.5487740635871887,
+                    -0.6810643672943115,
+                    -0.13531559705734253,
+                    0.37723132967948914,
+                ],
+                [
+                    0.41016456484794617,
+                    0.5712682008743286,
+                    -2.757962703704834,
+                    1.0762799978256226,
+                    -0.6141325235366821,
+                ],
+                [
+                    1.830764889717102,
+                    -1.1468064785003662,
+                    0.053837940096855164,
+                    -2.5074806213378906,
+                    -0.5916498899459839,
+                ],
+            ],
+            [
+                [
+                    0.8586049675941467,
+                    -0.2279418259859085,
+                    0.2013147622346878,
+                    0.35005471110343933,
+                    0.5360521078109741,
+                ],
+                [
+                    1.5194443464279175,
+                    1.9040879011154175,
+                    -1.5734431743621826,
+                    -0.14007866382598877,
+                    0.29670074582099915,
+                ],
+                [
+                    1.3111951351165771,
+                    0.5035904049873352,
+                    -1.1894450187683105,
+                    -0.5502137541770935,
+                    -1.591875672340393,
+                ],
+                [
+                    -1.1081947088241577,
+                    0.07872020453214645,
+                    -0.9185634255409241,
+                    -0.7457143664360046,
+                    -1.2080862522125244,
+                ],
+                [
+                    1.8140212297439575,
+                    -1.5227429866790771,
+                    -2.515244960784912,
+                    -1.3549325466156006,
+                    -0.9574840068817139,
+                ],
+            ],
+        ]
+    ]
+)
+test_conv2d_group_output = np.array(
+    [
+        [
+            [
+                [-8.836943626403809, 3.2316627502441406, 6.994439601898193],
+                [-0.8386597037315369, -9.857108116149902, 13.68197250366211],
+                [-13.020713806152344, 7.310227870941162, -3.3760271072387695],
+            ],
+            [
+                [-4.803101539611816, 1.026240587234497, 0.5452112555503845],
+                [-6.839838027954102, 2.0195930004119873, 0.11328654736280441],
+                [0.393694669008255, 4.987061023712158, 3.297354221343994],
+            ],
+        ]
+    ]
+)
+test_conv2d_padding_weight = np.array(
+    [
+        [
+            [
+                [0.8586049675941467, -0.2279418259859085, 0.2013147622346878],
+                [0.35005471110343933, 0.5360521078109741, 1.5194443464279175],
+                [1.9040879011154175, -1.5734431743621826, -0.14007866382598877],
+            ]
+        ]
+    ]
+)
+test_conv2d_padding_data = np.array(
+    [
+        [
+            [
+                [
+                    1.1630785465240479,
+                    0.4838046133518219,
+                    0.299563467502594,
+                    0.15302546322345734,
+                    -1.168814778327942,
+                ],
+                [
+                    1.5580710172653198,
+                    -0.5459445714950562,
+                    -2.3556296825408936,
+                    0.5414402484893799,
+                    2.678506374359131,
+                ],
+                [
+                    1.2546343803405762,
+                    -0.5487740635871887,
+                    -0.6810643672943115,
+                    -0.13531559705734253,
+                    0.37723132967948914,
+                ],
+                [
+                    0.41016456484794617,
+                    0.5712682008743286,
+                    -2.757962703704834,
+                    1.0762799978256226,
+                    -0.6141325235366821,
+                ],
+                [
+                    1.830764889717102,
+                    -1.1468064785003662,
+                    0.053837940096855164,
+                    -2.5074806213378906,
+                    -0.5916498899459839,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_padding_data_grad = np.array(
+    [
+        [
+            [
+                [
+                    3.237529069185257,
+                    3.237529069185257,
+                    3.237529069185257,
+                    3.237529069185257,
+                    3.237529069185257,
+                ],
+                [
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                ],
+                [
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                ],
+                [
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                    3.428095132112503,
+                ],
+                [
+                    2.596117228269577,
+                    2.596117228269577,
+                    2.596117228269577,
+                    2.596117228269577,
+                    2.596117228269577,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_padding_weight_grad = np.array(
+    [
+        [
+            [
+                [1.7594299167394638, 1.7594299167394638, 1.7594299167394638],
+                [-0.6019042432308197, -0.6019042432308197, -0.6019042432308197],
+                [-1.532561555504799, -1.532561555504799, -1.532561555504799],
+            ]
+        ]
+    ]
+)
+test_conv2d_padding_output = np.array(
+    [
+        [
+            [
+                [
+                    1.5489805936813354,
+                    -1.0164761543273926,
+                    5.277345657348633,
+                    3.153532028198242,
+                    -7.301508903503418,
+                    -3.7565059661865234,
+                    4.690962314605713,
+                ],
+                [
+                    2.425799608230591,
+                    -2.0592665672302246,
+                    0.9699610471725464,
+                    -0.20758534967899323,
+                    2.3857712745666504,
+                    1.1719579696655273,
+                    0.6523551940917969,
+                ],
+                [
+                    2.1625545024871826,
+                    -1.3517316579818726,
+                    0.3666309118270874,
+                    4.690882682800293,
+                    -8.203354835510254,
+                    3.0248217582702637,
+                    1.2624683380126953,
+                ],
+                [
+                    0.6193475723266602,
+                    -2.0285415649414062,
+                    2.6072847843170166,
+                    -1.9033538103103638,
+                    2.331153154373169,
+                    -3.998155355453491,
+                    -1.0176407098770142,
+                ],
+                [
+                    2.8643176555633545,
+                    -0.7396122217178345,
+                    -0.2253415733575821,
+                    -2.846742630004883,
+                    -4.961236476898193,
+                    -0.1308247298002243,
+                    -0.7344070672988892,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_stride_weight = np.array(
+    [
+        [
+            [
+                [0.8586049675941467, -0.2279418259859085, 0.2013147622346878],
+                [0.35005471110343933, 0.5360521078109741, 1.5194443464279175],
+                [1.9040879011154175, -1.5734431743621826, -0.14007866382598877],
+            ]
+        ]
+    ]
+)
+test_conv2d_stride_data = np.array(
+    [
+        [
+            [
+                [
+                    1.1630785465240479,
+                    0.4838046133518219,
+                    0.299563467502594,
+                    0.15302546322345734,
+                    -1.168814778327942,
+                ],
+                [
+                    1.5580710172653198,
+                    -0.5459445714950562,
+                    -2.3556296825408936,
+                    0.5414402484893799,
+                    2.678506374359131,
+                ],
+                [
+                    1.2546343803405762,
+                    -0.5487740635871887,
+                    -0.6810643672943115,
+                    -0.13531559705734253,
+                    0.37723132967948914,
+                ],
+                [
+                    0.41016456484794617,
+                    0.5712682008743286,
+                    -2.757962703704834,
+                    1.0762799978256226,
+                    -0.6141325235366821,
+                ],
+                [
+                    1.830764889717102,
+                    -1.1468064785003662,
+                    0.053837940096855164,
+                    -2.5074806213378906,
+                    -0.5916498899459839,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_stride_data_grad = np.array(
+    [
+        [
+            [
+                [
+                    0.5360521078109741,
+                    1.5194443464279175,
+                    0.3500547111034393,
+                    0.5360521078109741,
+                    1.5194443464279175,
+                ],
+                [
+                    -1.8013850003480911,
+                    0.061236098408699,
+                    2.762692868709564,
+                    -1.8013850003480911,
+                    0.061236098408699,
+                ],
+                [
+                    0.5360521078109741,
+                    1.5194443464279175,
+                    0.3500547111034393,
+                    0.5360521078109741,
+                    1.5194443464279175,
+                ],
+                [
+                    -1.8013850003480911,
+                    0.061236098408699,
+                    2.762692868709564,
+                    -1.8013850003480911,
+                    0.061236098408699,
+                ],
+                [
+                    0.5360521078109741,
+                    1.5194443464279175,
+                    0.3500547111034393,
+                    0.5360521078109741,
+                    1.5194443464279175,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_stride_weight_grad = np.array(
+    [
+        [
+            [
+                [-5.1135923862457275, 3.5859558284282684, 2.089697480201721],
+                [-0.3276629596948624, 1.7587070614099503, -2.5950092673301697],
+                [-5.1135923862457275, 3.5859558284282684, 2.089697480201721],
+            ]
+        ]
+    ]
+)
+test_conv2d_stride_output = np.array(
+    [
+        [
+            [
+                [-1.0164761543273926, -7.301508903503418],
+                [-1.3517316579818726, -8.203354835510254],
+                [-0.7396122217178345, -4.961236476898193],
+            ]
+        ]
+    ]
+)
+test_conv2d_kernel_weight = np.array(
+    [
+        [
+            [
+                [
+                    -0.9574840068817139,
+                    -0.7248556613922119,
+                    1.1119636297225952,
+                    -0.47827261686325073,
+                    -1.1739492416381836,
+                ],
+                [
+                    -0.7921845316886902,
+                    0.9321041703224182,
+                    -3.1557741165161133,
+                    2.1935296058654785,
+                    -0.5385921001434326,
+                ],
+                [
+                    -0.8611332774162292,
+                    -1.881519079208374,
+                    -0.7205708026885986,
+                    -0.35601571202278137,
+                    -0.15963983535766602,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_kernel_data = np.array(
+    [
+        [
+            [
+                [
+                    1.1630785465240479,
+                    0.4838046133518219,
+                    0.299563467502594,
+                    0.15302546322345734,
+                    -1.168814778327942,
+                    1.5580710172653198,
+                    -0.5459445714950562,
+                ],
+                [
+                    -2.3556296825408936,
+                    0.5414402484893799,
+                    2.678506374359131,
+                    1.2546343803405762,
+                    -0.5487740635871887,
+                    -0.6810643672943115,
+                    -0.13531559705734253,
+                ],
+                [
+                    0.37723132967948914,
+                    0.41016456484794617,
+                    0.5712682008743286,
+                    -2.757962703704834,
+                    1.0762799978256226,
+                    -0.6141325235366821,
+                    1.830764889717102,
+                ],
+                [
+                    -1.1468064785003662,
+                    0.053837940096855164,
+                    -2.5074806213378906,
+                    -0.5916498899459839,
+                    0.8586049675941467,
+                    -0.2279418259859085,
+                    0.2013147622346878,
+                ],
+                [
+                    0.35005471110343933,
+                    0.5360521078109741,
+                    1.5194443464279175,
+                    1.9040879011154175,
+                    -1.5734431743621826,
+                    -0.14007866382598877,
+                    0.29670074582099915,
+                ],
+                [
+                    1.3111951351165771,
+                    0.5035904049873352,
+                    -1.1894450187683105,
+                    -0.5502137541770935,
+                    -1.591875672340393,
+                    -1.1081947088241577,
+                    0.07872020453214645,
+                ],
+                [
+                    -0.9185634255409241,
+                    -0.7457143664360046,
+                    -1.2080862522125244,
+                    1.8140212297439575,
+                    -1.5227429866790771,
+                    -2.515244960784912,
+                    -1.3549325466156006,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_kernel_data_grad = np.array(
+    [
+        [
+            [
+                [
+                    -0.9574840068817139,
+                    -1.6823396682739258,
+                    -0.5703760385513306,
+                    -0.0911646485328674,
+                    -0.5402582287788391,
+                    -1.6522218585014343,
+                    -1.1739492416381836,
+                ],
+                [
+                    -1.749668538570404,
+                    -1.5424200296401978,
+                    -3.586230516433716,
+                    -0.121304988861084,
+                    -2.0410948395729065,
+                    0.0027156472206116,
+                    -1.7125413417816162,
+                ],
+                [
+                    -2.6108018159866333,
+                    -4.285072386264801,
+                    -7.049453675746918,
+                    -3.079410582780838,
+                    -3.2773211896419525,
+                    -0.5129399001598358,
+                    -1.8721811771392822,
+                ],
+                [
+                    -2.6108018159866333,
+                    -4.285072386264801,
+                    -7.049453675746918,
+                    -3.079410582780838,
+                    -3.2773211896419525,
+                    -0.5129399001598358,
+                    -1.8721811771392822,
+                ],
+                [
+                    -2.6108018159866333,
+                    -4.285072386264801,
+                    -7.049453675746918,
+                    -3.079410582780838,
+                    -3.2773211896419525,
+                    -0.5129399001598358,
+                    -1.8721811771392822,
+                ],
+                [
+                    -1.6533178091049194,
+                    -2.6027327179908752,
+                    -6.479077637195587,
+                    -2.9882459342479706,
+                    -2.7370629608631134,
+                    1.1392819583415985,
+                    -0.6982319355010986,
+                ],
+                [
+                    -0.8611332774162292,
+                    -2.7426523566246033,
+                    -3.463223159313202,
+                    -2.958105593919754,
+                    -1.236226350069046,
+                    -0.5156555473804474,
+                    -0.159639835357666,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_kernel_weight_grad = np.array(
+    [
+        [
+            [
+                [
+                    2.974529668688774,
+                    4.548736393451691,
+                    1.1672898679971695,
+                    -1.499158263206482,
+                    0.1862268149852753,
+                ],
+                [
+                    1.6534235626459122,
+                    2.3762744814157486,
+                    -1.448018729686737,
+                    -5.2917241007089615,
+                    -2.278435029089451,
+                ],
+                [
+                    -2.083257421851158,
+                    -2.23808591067791,
+                    -5.749193429946899,
+                    -7.540486767888069,
+                    -6.306201495230198,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_kernel_output = np.array(
+    [
+        [
+            [
+                [-3.5647754669189453, -4.234736919403076, 1.4046944379806519],
+                [-0.6964312791824341, 16.42838478088379, -9.649789810180664],
+                [4.312150478363037, -6.283960819244385, -4.8443922996521],
+                [-2.772286891937256, -4.483709812164307, 12.315184593200684],
+                [7.39893913269043, 1.305102825164795, -2.049992561340332],
+            ]
+        ]
+    ]
+)
+test_conv2d_dilation_weight = np.array(
+    [
+        [
+            [
+                [-0.9574840068817139, -0.7248556613922119, 1.1119636297225952],
+                [-0.47827261686325073, -1.1739492416381836, -0.7921845316886902],
+                [0.9321041703224182, -3.1557741165161133, 2.1935296058654785],
+            ]
+        ]
+    ]
+)
+test_conv2d_dilation_data = np.array(
+    [
+        [
+            [
+                [
+                    1.1630785465240479,
+                    0.4838046133518219,
+                    0.299563467502594,
+                    0.15302546322345734,
+                    -1.168814778327942,
+                    1.5580710172653198,
+                    -0.5459445714950562,
+                ],
+                [
+                    -2.3556296825408936,
+                    0.5414402484893799,
+                    2.678506374359131,
+                    1.2546343803405762,
+                    -0.5487740635871887,
+                    -0.6810643672943115,
+                    -0.13531559705734253,
+                ],
+                [
+                    0.37723132967948914,
+                    0.41016456484794617,
+                    0.5712682008743286,
+                    -2.757962703704834,
+                    1.0762799978256226,
+                    -0.6141325235366821,
+                    1.830764889717102,
+                ],
+                [
+                    -1.1468064785003662,
+                    0.053837940096855164,
+                    -2.5074806213378906,
+                    -0.5916498899459839,
+                    0.8586049675941467,
+                    -0.2279418259859085,
+                    0.2013147622346878,
+                ],
+                [
+                    0.35005471110343933,
+                    0.5360521078109741,
+                    1.5194443464279175,
+                    1.9040879011154175,
+                    -1.5734431743621826,
+                    -0.14007866382598877,
+                    0.29670074582099915,
+                ],
+                [
+                    1.3111951351165771,
+                    0.5035904049873352,
+                    -1.1894450187683105,
+                    -0.5502137541770935,
+                    -1.591875672340393,
+                    -1.1081947088241577,
+                    0.07872020453214645,
+                ],
+                [
+                    -0.9185634255409241,
+                    -0.7457143664360046,
+                    -1.2080862522125244,
+                    1.8140212297439575,
+                    -1.5227429866790771,
+                    -2.515244960784912,
+                    -1.3549325466156006,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_dilation_data_grad = np.array(
+    [
+        [
+            [
+                [
+                    -0.9574840068817139,
+                    0.0,
+                    0.0,
+                    -0.7248556613922119,
+                    0.0,
+                    0.0,
+                    1.1119636297225952,
+                ],
+                [
+                    -0.9574840068817139,
+                    0.0,
+                    0.0,
+                    -0.7248556613922119,
+                    0.0,
+                    0.0,
+                    1.1119636297225952,
+                ],
+                [
+                    -1.4357566237449646,
+                    0.0,
+                    0.0,
+                    -1.8988049030303955,
+                    0.0,
+                    0.0,
+                    0.319779098033905,
+                ],
+                [
+                    -0.4782726168632507,
+                    0.0,
+                    0.0,
+                    -1.1739492416381836,
+                    0.0,
+                    0.0,
+                    -0.7921845316886902,
+                ],
+                [
+                    0.4538315534591675,
+                    0.0,
+                    0.0,
+                    -4.329723358154297,
+                    0.0,
+                    0.0,
+                    1.4013450741767883,
+                ],
+                [
+                    0.9321041703224182,
+                    0.0,
+                    0.0,
+                    -3.1557741165161133,
+                    0.0,
+                    0.0,
+                    2.1935296058654785,
+                ],
+                [
+                    0.9321041703224182,
+                    0.0,
+                    0.0,
+                    -3.1557741165161133,
+                    0.0,
+                    0.0,
+                    2.1935296058654785,
+                ],
+            ]
+        ]
+    ]
+)
+test_conv2d_dilation_weight_grad = np.array(
+    [
+        [
+            [
+                [-0.8153198063373566, -1.3503028601408005, 1.1495047211647034],
+                [-0.4195204377174377, -1.4455246925354004, 2.328780397772789],
+                [0.7426864206790924, 3.1678953766822815, -0.979511596262455],
+            ]
+        ]
+    ]
+)
+test_conv2d_dilation_output = np.array(
+    [[[[-5.2563982009887695], [5.410353183746338], [-8.517012596130371]]]]
+)
+
+
+def _test_conv2d(test_case, conv, data, weight, output, bias=None, device="cuda"):
+    to_device = flow.device(device)
+    x = flow.Tensor(data, device=to_device)
+    conv.weight = flow.nn.Parameter(flow.Tensor(weight))
+    if bias is not None:
+        conv.bias = flow.nn.Parameter(flow.Tensor(bias))
+    conv.to(to_device)
+    of_out = conv(x)
+    test_case.assertTrue(np.allclose(of_out.numpy(), output, rtol=0.001, atol=1e-07))
+
+
+def _test_conv2d_backward(
+    test_case, conv, data, weight, data_grad, weight_grad, bias=None, device="cuda"
+):
+    to_device = flow.device(device)
+    x = flow.Tensor(data, device=to_device, requires_grad=True)
+    conv.weight = flow.nn.Parameter(flow.Tensor(weight), requires_grad=True)
+    if bias is not None:
+        conv.bias = flow.nn.Parameter(flow.Tensor(bias))
+    conv.to(to_device)
+    of_out = conv(x)
+    of_out.sum().backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), data_grad, rtol=0.0001, atol=1e-08)
+    )
+    test_case.assertTrue(
+        np.allclose(conv.weight.grad.numpy(), weight_grad, rtol=0.0001, atol=1e-08)
+    )
+
+
+def _test_conv2d_large_in_channel(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [
+                    [
+                        0.6206631238581714,
+                        -1.1225329393404626,
+                        0.8407155480700242,
+                        -0.6845162855236345,
+                    ],
+                    [
+                        -0.5186484633906412,
+                        0.10420735184519186,
+                        -0.1711568947473012,
+                        0.5168640476046483,
+                    ],
+                    [
+                        -0.12429464919764661,
+                        0.050277779246134253,
+                        -1.0144501797426606,
+                        -2.184600444658526,
+                    ],
+                    [
+                        0.28918126931309923,
+                        -0.822872663244595,
+                        0.44019150436683663,
+                        -1.0247720130825562,
+                    ],
+                ],
+                [
+                    [
+                        0.7786504412818226,
+                        -0.7501839068078657,
+                        -0.8187283189941765,
+                        -1.1116653569170698,
+                    ],
+                    [
+                        0.18085524152316743,
+                        -1.3461349607476678,
+                        1.142505437476448,
+                        -0.000649619704040145,
+                    ],
+                    [
+                        0.03160672782674317,
+                        -0.006318157449953413,
+                        1.2218487782604377,
+                        0.15903027907930234,
+                    ],
+                    [
+                        1.5857011815642381,
+                        0.6656477116332891,
+                        -0.04036621813223574,
+                        -0.3427168687988546,
+                    ],
+                ],
+                [
+                    [
+                        -1.1774346070102524,
+                        1.6195241269303395,
+                        -0.36185552303441965,
+                        -1.1382193113192487,
+                    ],
+                    [
+                        0.08061907334568702,
+                        1.5025447613238763,
+                        -1.1591348706634745,
+                        1.6449050139676873,
+                    ],
+                    [
+                        1.1539915649822392,
+                        -2.414624939646017,
+                        0.3056063774849572,
+                        1.1920089257083162,
+                    ],
+                    [
+                        0.7623012858982319,
+                        -0.01685314742940813,
+                        -1.096666898224702,
+                        -0.4406476137098582,
+                    ],
+                ],
+                [
+                    [
+                        0.9383797282214235,
+                        -1.1075876842796508,
+                        -0.4420913825139058,
+                        -1.0736097610655628,
+                    ],
+                    [
+                        -0.3101376466546291,
+                        1.6578227745160954,
+                        -0.6225454278031398,
+                        0.6831188620748697,
+                    ],
+                    [
+                        0.00743800968372913,
+                        -0.8089158949698473,
+                        2.08084287836801,
+                        0.721204366332351,
+                    ],
+                    [
+                        0.5694701823297723,
+                        0.031519314469744895,
+                        -0.5041680957766629,
+                        -0.4738588233094669,
+                    ],
+                ],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [
+                [
+                    [0.06456436216831207, -0.10852358490228653, -0.21638715267181396],
+                    [-0.2279110550880432, 0.1476770043373108, 0.19457484781742096],
+                    [0.05026858672499657, 0.10818571597337723, 0.02056501805782318],
+                ],
+                [
+                    [0.205095112323761, 0.1488947868347168, -0.2344113141298294],
+                    [0.1684819906949997, -0.21986986696720123, 0.1082606166601181],
+                    [-0.1528974026441574, 0.17120417952537537, 0.01954500749707222],
+                ],
+            ],
+            [
+                [
+                    [-0.09441672265529633, -0.03644559532403946, -0.22235223650932312],
+                    [-0.1771145612001419, 0.08043312281370163, 0.06938580423593521],
+                    [0.054393064230680466, -0.05483492836356163, 0.23438701033592224],
+                ],
+                [
+                    [0.22666795551776886, 0.0874653309583664, 0.07092718034982681],
+                    [0.08883464336395264, -0.052362944930791855, -0.1720171570777893],
+                    [0.10441060364246368, 0.011952142231166363, -0.0894528403878212],
+                ],
+            ],
+        ]
+    )
+    m = flow.nn.Conv2d(4, 2, 3, groups=2, bias=False)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight), requires_grad=True)
+    m = m.to(device)
+    output = m(input)
+    np_out = [
+        [
+            [
+                [0.7666134238243103, -0.3961866497993469],
+                [-0.656266987323761, -1.1613956689834595],
+            ],
+            [
+                [0.3077264130115509, -0.42817503213882446],
+                [-0.5761325359344482, 0.1300736665725708],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = [
+        [
+            [
+                [
+                    0.06456436216831207,
+                    -0.04395922273397446,
+                    -0.3249107301235199,
+                    -0.21638715267181396,
+                ],
+                [
+                    -0.16334669291973114,
+                    -0.12419328093528748,
+                    0.017341122031211853,
+                    -0.021812304854393005,
+                ],
+                [
+                    -0.17764246463775635,
+                    0.07822024822235107,
+                    0.47100257873535156,
+                    0.21513986587524414,
+                ],
+                [
+                    0.05026858672499657,
+                    0.1584542989730835,
+                    0.128750741481781,
+                    0.02056501805782318,
+                ],
+            ],
+            [
+                [
+                    0.205095112323761,
+                    0.3539898991584778,
+                    -0.08551652729511261,
+                    -0.2344113141298294,
+                ],
+                [
+                    0.3735771179199219,
+                    0.30260205268859863,
+                    -0.19712577760219574,
+                    -0.1261506974697113,
+                ],
+                [
+                    0.015584588050842285,
+                    -0.03308109939098358,
+                    0.07913993299007416,
+                    0.12780562043190002,
+                ],
+                [
+                    -0.1528974026441574,
+                    0.018306776881217957,
+                    0.1907491832971573,
+                    0.01954500749707222,
+                ],
+            ],
+            [
+                [
+                    -0.09441672265529633,
+                    -0.13086232542991638,
+                    -0.258797824382782,
+                    -0.22235223650932312,
+                ],
+                [
+                    -0.27153128385543823,
+                    -0.22754377126693726,
+                    -0.10897888988256454,
+                    -0.1529664397239685,
+                ],
+                [
+                    -0.12272149324417114,
+                    -0.09712330251932144,
+                    0.32937100529670715,
+                    0.30377280712127686,
+                ],
+                [
+                    0.054393064230680466,
+                    -0.00044186413288116455,
+                    0.1795520782470703,
+                    0.23438701033592224,
+                ],
+            ],
+            [
+                [
+                    0.22666795551776886,
+                    0.31413328647613525,
+                    0.1583925187587738,
+                    0.07092718034982681,
+                ],
+                [
+                    0.3155025839805603,
+                    0.35060498118400574,
+                    -0.06598758697509766,
+                    -0.1010899767279625,
+                ],
+                [
+                    0.19324524700641632,
+                    0.1528344452381134,
+                    -0.301880806684494,
+                    -0.2614699900150299,
+                ],
+                [
+                    0.10441060364246368,
+                    0.11636274307966232,
+                    -0.07750070095062256,
+                    -0.0894528403878212,
+                ],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv2d_large_out_channel(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [
+                    [0.56573248, -0.1968932, -0.67875558, 0.34328273, 0.31964567],
+                    [-1.33715475, 0.33422229, -1.27643383, 0.37904647, 0.35891593],
+                    [0.84579802, 2.12729621, -0.51423287, 0.6129756, -1.31156564],
+                    [-0.71047139, 1.02679253, -0.76686019, -0.72969633, 0.7342515],
+                    [-0.13592879, -1.03207183, -0.22554775, 0.74148071, 0.9660151],
+                ],
+                [
+                    [0.51595992, 0.49624804, 0.91145641, 0.49247262, 0.41002217],
+                    [-1.08001196, 1.55497086, -0.8196314, -0.45511565, -0.60269165],
+                    [0.05563145, -0.94318372, -1.17058158, -0.73568577, 0.57810956],
+                    [-0.40260276, -0.10309298, 1.123788, -0.23510537, -0.73893374],
+                    [-0.52712536, -0.00717016, -1.85051966, -1.5079056, 1.38335907],
+                ],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [
+                [
+                    [-0.19489679, -0.32377058, 0.21736273],
+                    [0.04095296, -0.21552679, -0.14626531],
+                    [-0.19359522, -0.00742865, -0.19832158],
+                ]
+            ],
+            [
+                [
+                    [0.29926914, 0.00931164, 0.2619766],
+                    [0.27611443, -0.15439281, -0.19027126],
+                    [-0.2890912, 0.30367029, -0.05168664],
+                ]
+            ],
+            [
+                [
+                    [-0.03155736, 0.17610769, 0.22111714],
+                    [0.2279067, -0.32897446, -0.03260243],
+                    [-0.10274851, -0.06903386, -0.19438276],
+                ]
+            ],
+            [
+                [
+                    [-0.24573688, -0.06723209, -0.21363299],
+                    [-0.02136187, -0.24994437, -0.18691199],
+                    [0.12189507, 0.29469389, 0.03398871],
+                ]
+            ],
+        ]
+    )
+    m = flow.nn.Conv2d(2, 4, 3, groups=2, bias=False)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight), requires_grad=True)
+    m = m.to(device)
+    output = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [-0.21170563, 0.03652292, 0.25926736],
+                    [-0.19168918, 0.49044561, 0.25099146],
+                    [-1.0248934, 0.25361472, -0.51828313],
+                ],
+                [
+                    [0.23977707, -0.56090075, -0.19285655],
+                    [-0.17167747, 0.24558367, -0.3093586],
+                    [-0.33303234, 1.52472734, -0.49013454],
+                ],
+                [
+                    [-0.17137986, 1.21333742, 0.18988736],
+                    [0.31785482, -0.1212157, -0.18676008],
+                    [-0.10680684, -0.30298883, 0.41809759],
+                ],
+                [
+                    [-0.87821335, -0.51665992, -0.44061098],
+                    [0.7480458, 0.5310725, 0.50418228],
+                    [-0.00512899, -0.3645584, -0.23643512],
+                ],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-05, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [
+            [
+                [
+                    [0.10437235, -0.21008658, 0.26925275, 0.16488039, 0.47933933],
+                    [0.42143974, -0.2629388, -0.12013602, -0.54157579, 0.14280275],
+                    [-0.06124666, -0.44938356, -0.55658901, -0.49534237, -0.10720548],
+                    [-0.16561902, -0.23929697, -0.82584178, -0.66022277, -0.58654481],
+                    [-0.4826864, -0.18644476, -0.43645298, 0.04623342, -0.25000823],
+                ],
+                [
+                    [-0.27729425, -0.16841865, -0.16093449, 0.11635975, 0.00748415],
+                    [-0.07074942, -0.54079264, -0.75282294, -0.68207347, -0.21203026],
+                    [-0.05160286, -0.29598606, -0.66841042, -0.61680746, -0.3724243],
+                    [0.22569139, -0.12756741, -0.50747585, -0.73316729, -0.37990844],
+                    [0.01914656, 0.24480659, 0.08441254, 0.06526598, -0.16039404],
+                ],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConv2d(flow.unittest.TestCase):
+    def test_conv2d_default_init(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 1, (3, 3), bias=True).to(flow.device(device))
+            test_case.assertTrue(
+                not np.allclose(
+                    conv.weight.numpy(), np.zeros((1, 1, 3, 3)), rtol=1e-09, atol=1e-10
+                )
+            )
+            test_case.assertTrue(
+                not np.allclose(
+                    conv.bias.numpy(), np.zeros((1,)), rtol=1e-09, atol=1e-10
+                )
+            )
+
+    def test_conv2d(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device(device))
+            _test_conv2d(
+                test_case,
+                conv,
+                test_conv2d_data,
+                test_conv2d_weight,
+                test_conv2d_output,
+                device=device,
+            )
+
+    def test_conv2d_backward(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device(device))
+            _test_conv2d_backward(
+                test_case,
+                conv,
+                test_conv2d_data,
+                test_conv2d_weight,
+                test_conv2d_data_grad,
+                test_conv2d_weight_grad,
+                device=device,
+            )
+
+    def test_conv2d_with_bias(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(flow.device(device))
+            _test_conv2d(
+                test_case,
+                conv,
+                test_conv2d_with_bias_data,
+                test_conv2d_with_bias_weight,
+                test_conv2d_with_bias_output,
+                bias=test_conv2d_with_bias_bias,
+                device=device,
+            )
+
+    def test_conv2d_group(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(2, 2, (3, 3), groups=2, bias=False).to(
+                flow.device(device)
+            )
+            _test_conv2d(
+                test_case,
+                conv,
+                test_conv2d_group_data,
+                test_conv2d_group_weight,
+                test_conv2d_group_output,
+                device=device,
+            )
+
+    def test_conv2d_group_backward(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(2, 2, (3, 3), groups=2, bias=False).to(
+                flow.device(device)
+            )
+            _test_conv2d_backward(
+                test_case,
+                conv,
+                test_conv2d_group_data,
+                test_conv2d_group_weight,
+                test_conv2d_group_data_grad,
+                test_conv2d_group_weight_grad,
+                device=device,
+            )
+
+    def test_conv2d_padding(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 1, (3, 3), padding=(1, 2), bias=False).to(
+                flow.device(device)
+            )
+            _test_conv2d(
+                test_case,
+                conv,
+                test_conv2d_padding_data,
+                test_conv2d_padding_weight,
+                test_conv2d_padding_output,
+                device=device,
+            )
+
+    def test_conv2d_padding_backward(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 1, (3, 3), padding=(1, 2), bias=False).to(
+                flow.device(device)
+            )
+            _test_conv2d_backward(
+                test_case,
+                conv,
+                test_conv2d_padding_data,
+                test_conv2d_padding_weight,
+                test_conv2d_padding_data_grad,
+                test_conv2d_padding_weight_grad,
+                device=device,
+            )
+
+    def test_conv2d_stride(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(
+                1, 1, (3, 3), padding=(1, 1), stride=(2, 3), bias=False
+            ).to(flow.device(device))
+            _test_conv2d(
+                test_case,
+                conv,
+                test_conv2d_stride_data,
+                test_conv2d_stride_weight,
+                test_conv2d_stride_output,
+                device=device,
+            )
+
+    def test_conv2d_stride_backward(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(
+                1, 1, (3, 3), padding=(1, 1), stride=(2, 3), bias=False
+            ).to(flow.device(device))
+            _test_conv2d_backward(
+                test_case,
+                conv,
+                test_conv2d_stride_data,
+                test_conv2d_stride_weight,
+                test_conv2d_stride_data_grad,
+                test_conv2d_stride_weight_grad,
+                device=device,
+            )
+
+    def test_conv2d_kernel(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 1, (3, 5), bias=False).to(flow.device(device))
+            conv.to(flow.device("cuda"))
+            _test_conv2d(
+                test_case,
+                conv,
+                test_conv2d_kernel_data,
+                test_conv2d_kernel_weight,
+                test_conv2d_kernel_output,
+            )
+
+    def test_conv2d_kernel_backward(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 1, (3, 5), bias=False).to(flow.device(device))
+            conv.to(flow.device("cuda"))
+            _test_conv2d_backward(
+                test_case,
+                conv,
+                test_conv2d_kernel_data,
+                test_conv2d_kernel_weight,
+                test_conv2d_kernel_data_grad,
+                test_conv2d_kernel_weight_grad,
+            )
+
+    def test_conv2d_dilation(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 1, (3, 3), dilation=(2, 3), bias=False).to(
+                flow.device(device)
+            )
+            _test_conv2d(
+                test_case,
+                conv,
+                test_conv2d_dilation_data,
+                test_conv2d_dilation_weight,
+                test_conv2d_dilation_output,
+                device=device,
+            )
+
+    def test_conv2d_dilation_backward(test_case):
+        for device in ["cuda", "cpu"]:
+            conv = flow.nn.Conv2d(1, 1, (3, 3), dilation=(2, 3), bias=False).to(
+                flow.device(device)
+            )
+            _test_conv2d_backward(
+                test_case,
+                conv,
+                test_conv2d_dilation_data,
+                test_conv2d_dilation_weight,
+                test_conv2d_dilation_data_grad,
+                test_conv2d_dilation_weight_grad,
+                device=device,
+            )
+
+    def test_large_in_channel_group_conv(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_conv2d_large_in_channel]
+        arg_dict["device"] = ["cuda", "cpu"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_large_out_channel_group_conv(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_conv2d_large_out_channel]
+        arg_dict["device"] = ["cuda", "cpu"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @unittest.skip("need a more relaxed tolerance")
+    def test_with_random_data(test_case):
+        for device in ["cpu", "cuda"]:
+            channels = random(1, 6)
+            test_module_against_pytorch(
+                test_case,
+                "nn.Conv2d",
+                extra_generators={
+                    "input": random_tensor(ndim=4, dim1=channels),
+                    "in_channels": channels,
+                    "out_channels": random(1, 129),
+                    "kernel_size": random(1, 4),
+                    "stride": random(1, 4),
+                    "padding": random(1, 5),
+                    "dilation": random(1, 5),
+                    "groups": random(1, 5),
+                    "padding_mode": constant("zeros"),
+                },
+                device=device,
+            )
+
+    @unittest.skip("need a more relaxed tolerance")
+    @autotest()
+    def test_against_pytorch(test_case):
+        channels = random(1, 6)
+        m = torch.nn.Conv2d(
+            channels,
+            random(1, 6),
+            random(1, 6),
+            stride=random(1, 3) | nothing(),
+            padding=random(1, 3) | nothing(),
+            dilation=random(1, 3) | nothing(),
+            groups=random(1, 3) | nothing(),
+            bias=random() | nothing(),
+            padding_mode=constant("zeros") | nothing(),
+        )
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor(
+            ndim=4, dim1=channels, dim2=random(1, 8), dim3=random(1, 8)
+        ).to(device)
+        y = m(x)
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_conv1d.py b/python/oneflow/test/modules/test_conv1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb91c0c647473e14d16a1a7361bfbc52e1ac2756
--- /dev/null
+++ b/python/oneflow/test/modules/test_conv1d.py
@@ -0,0 +1,439 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.nn as nn
+import oneflow.unittest
+
+
+def _test_conv1d_bias_false(test_case, device):
+    np_arr = np.array([[[1.28795946, -0.2921792, 0.20338029, 0.78604293, -1.89607573]]])
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [[0.10197904, 0.3372305, -0.25743008]],
+            [[0.27720425, -0.52435774, -0.38381988]],
+            [[0.56016803, -0.10063095, -0.10760903]],
+        ]
+    )
+    m = nn.Conv1d(1, 3, 3, stride=1, bias=False)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m = m.to(device)
+    output = m(input)
+    np_out = np.array(
+        [
+            [
+                [-0.01954307, -0.16356121, 0.77392507],
+                [0.43217283, -0.48933625, 0.37196174],
+                [0.72899038, -0.2687211, 0.23886177],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [[[0.93935132, 0.65159315, -0.09726584, -1.03661716, -0.74885899]]]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv1d_bias_true(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [0.90499806, -1.11683071, 0.71605605, -0.56754625, 0.61944169],
+                [-0.31317389, -0.26271924, 0.95579433, 0.52468461, 1.48926127],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [
+                [0.01997352, 0.23834395, 0.00526353],
+                [-0.04861857, -0.22751901, -0.06725175],
+            ],
+            [
+                [0.13344523, -0.35202524, 0.15168799],
+                [-0.25714493, -0.17459838, 0.28768948],
+            ],
+            [
+                [0.10671382, -0.28205597, -0.39752254],
+                [0.36393702, 0.07843742, -0.33898622],
+            ],
+            [
+                [0.20485674, 0.04222689, -0.1898618],
+                [0.22519711, -0.15910202, -0.35057363],
+            ],
+        ]
+    )
+    bias = np.array([0.01012857, 0.38912651, -0.01600273, -0.3883304])
+    m = nn.Conv1d(2, 4, 3, stride=1, bias=True)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m.bias = flow.nn.Parameter(flow.Tensor(bias))
+    m = m.to(device)
+    np_out = np.array(
+        [
+            [
+                [-0.22349545, -0.08447243, -0.37358052],
+                [1.4130373, -0.04644597, 0.86949122],
+                [-0.34765026, -0.31004351, -0.14158708],
+                [-0.74985039, -0.87430149, -0.77354753],
+            ]
+        ]
+    )
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [
+            [
+                [0.4649893, 0.11147892, -0.3189539, -0.78394318, -0.43043283],
+                [0.28337064, -0.19941133, -0.66853344, -0.95190406, -0.46912211],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv1d_dilation(test_case, device):
+    np_arr = np.array(
+        [[[-0.43016902, 1.74619496, -0.57338119, 0.25563857, 0.12575546]]]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [[-0.35057205, -0.31304273, 0.46250814]],
+            [[-0.40786612, 0.36518192, 0.46280444]],
+            [[-0.00921835, -0.38710043, 0.47566161]],
+        ]
+    )
+    m = nn.Conv1d(1, 3, 3, stride=1, bias=False)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m = m.to(device)
+    output = m(input)
+    np_out = np.array(
+        [
+            [
+                [-0.66102189, -0.31443936, 0.17914855],
+                [0.54776692, -0.8032915, 0.38541752],
+                [-0.94472277, 0.32745653, -0.03385513],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [[[-0.76765651, -1.10261774, 0.29835641, 1.06601286, 1.40097415]]]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv1d_stride(test_case, device):
+    np_arr = np.array(
+        [[[-1.01312506, -0.40687919, 1.5985316, 0.53594196, -1.89935565]]]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [[0.5751484, 0.26589182, -0.026546]],
+            [[-0.10313249, -0.20797005, -0.48268208]],
+            [[-0.22216944, -0.14962578, 0.57433963]],
+        ]
+    )
+    m = nn.Conv1d(1, 3, 3, stride=2, bias=False)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m = m.to(device)
+    output = m(input)
+    np_out = np.array(
+        [
+            [
+                [-0.73331773, 1.11231577],
+                [-0.58247775, 0.64046454],
+                [1.20406508, -1.5262109],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [[[0.24984647, -0.09170401, 0.31495798, -0.09170401, 0.06511152]]]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv1d_group_bias_true(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [1.48566079, 0.54937589, 0.62353903, -0.94114172, -0.60260266],
+                [0.61150503, -0.50289607, 1.41735041, -1.85877609, -1.04875529],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [[0.25576305, 0.40814576, -0.05900212]],
+            [[-0.24829513, 0.42756805, -0.01354307]],
+            [[0.44658303, 0.46889144, 0.41060263]],
+            [[0.30083328, -0.5221613, 0.12215579]],
+        ]
+    )
+    bias = np.array([-0.03368823, -0.4212504, -0.42130581, -0.17434336])
+    m = nn.Conv1d(2, 4, 3, groups=2, stride=1, bias=True)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m.bias = flow.nn.Parameter(flow.Tensor(bias))
+    m = m.to(device)
+    np_out = np.array(
+        [
+            [
+                [0.53372419, 0.41684598, -0.22277816],
+                [-0.56368178, -0.27830642, -0.97031319],
+                [0.19794616, -0.74452549, -1.09052706],
+                [0.44534814, -1.29277706, 1.09451222],
+            ]
+        ]
+    )
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [
+            [
+                [0.00746793, 0.84318173, 0.77063656, 0.76316863, -0.07254519],
+                [0.74741632, 0.69414645, 1.22690487, 0.47948855, 0.53275841],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv1d_group_large_out_bias_true(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [2.17964911, 0.91623521, 1.24746692, 0.73605931, -0.23738743],
+                [-0.70412433, 0.10727754, 1.0207864, -0.09711888, -1.10814202],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [[-0.207307473, 0.12856324, 0.371991515]],
+            [[-0.416422307, 3.26921181e-05, -0.385845661]],
+            [[-0.182592362, 0.143281639, 0.419321984]],
+            [[-0.27117458, 0.0421470925, 0.377335936]],
+            [[0.546190619, -0.211819887, -0.29785803]],
+            [[0.334832489, 0.255918801, -0.0556600206]],
+        ]
+    )
+    bias = np.array(
+        [-0.56865668, 0.17631066, -0.43992457, -0.24307285, -0.53672957, -0.52927947]
+    )
+    m = nn.Conv1d(2, 6, 3, groups=2, stride=1, bias=True)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m.bias = flow.nn.Parameter(flow.Tensor(bias))
+    m = m.to(device)
+    np_out = np.array(
+        [
+            [
+                [-0.43867296, -0.32441288, -0.82094181],
+                [-1.21264362, -0.48919463, -0.25154343],
+                [-0.18354186, -0.11983716, -0.66178048],
+                [0.33756858, -0.26578707, -0.9421193],
+                [-1.2480886, -0.66543078, 0.37145507],
+                [-0.79440582, -0.22671542, -0.15066233],
+            ]
+        ]
+    )
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [
+            [
+                [-0.8063221, -0.53444451, -0.12897667, 0.6773454, 0.40546784],
+                [0.6098485, 0.69609451, 0.71991241, 0.1100639, 0.02381789],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv1d_group_large_in_bias_true(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [0.7382921, 0.3227571, -0.73204273, -0.01697334, 1.72585976],
+                [0.52866709, 0.28417364, 1.12931311, 1.73048413, -0.60748184],
+                [0.43222603, 0.7882517, -0.62105948, 0.10097823, 0.81639361],
+                [0.36671457, 0.24468753, -0.5824874, -0.74464536, -0.38901371],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [
+                [-0.29574063, -0.31176069, 0.17234495],
+                [0.06092392, 0.30691007, -0.36685407],
+            ],
+            [
+                [0.26149744, 0.07149458, 0.3209756],
+                [0.18960869, -0.37148297, -0.13602243],
+            ],
+        ]
+    )
+    bias = np.array([-0.35048512, -0.0093792])
+    m = nn.Conv1d(4, 2, 3, groups=2, stride=1, bias=True)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m.bias = flow.nn.Parameter(flow.Tensor(bias))
+    m = m.to(device)
+    np_out = np.array(
+        [[[-1.09048378, -0.49156523, 0.99150705], [0.01852397, 0.54882324, 0.31657016]]]
+    )
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [
+            [
+                [-0.29574063, -0.60750133, -0.43515638, -0.13941574, 0.17234495],
+                [0.06092392, 0.36783397, 0.0009799, -0.059944, -0.36685407],
+                [0.26149744, 0.33299202, 0.65396762, 0.39247018, 0.3209756],
+                [0.18960869, -0.18187428, -0.31789672, -0.50750542, -0.13602243],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_conv1d_compilcate(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [-1.00674784, 0.51784992, 0.39896572, 0.11018554, 0.91136694],
+                [1.95886874, 0.89779067, 0.4748213, 0.33313531, -0.49350029],
+                [-0.19280219, 0.04023677, 1.66438103, -0.83563608, 0.15925731],
+                [1.49166429, 1.45189261, -1.86512125, 0.34329697, 0.20413807],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [
+                [-0.36045218, 0.37349278, 0.04565236],
+                [0.0242328, -0.09459515, -0.30684742],
+            ],
+            [
+                [-0.30345008, -0.1196513, -0.26765293],
+                [0.09876197, 0.03346226, 0.2748405],
+            ],
+            [
+                [-0.37798449, 0.00242459, -0.34125558],
+                [-0.05174343, -0.10443231, 0.09526101],
+            ],
+            [
+                [0.34196907, -0.32667893, 0.40264183],
+                [0.38025281, 0.26807079, -0.09074812],
+            ],
+        ]
+    )
+    bias = np.array([-0.03499984, -0.21616256, 0.13312563, -0.24104381])
+    m = nn.Conv1d(4, 4, 3, groups=2, stride=2, padding=2, dilation=2, bias=True)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m.bias = flow.nn.Parameter(flow.Tensor(bias))
+    m = m.to(device)
+    np_out = np.array(
+        [
+            [
+                [-0.72379637, 0.67248386, 0.21977007],
+                [-0.00643994, -0.1286152, -0.41589433],
+                [-0.76877236, 0.29273134, -0.42040929],
+                [1.0612179, -0.73787093, -0.37839717],
+            ]
+        ]
+    )
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.array(
+        [
+            [
+                [-0.41006082, 0.0, -0.63206136, 0.0, 0.03184089],
+                [0.06186188, 0.0, 0.02985496, 0.0, -0.09313981],
+                [-0.36026976, 0.0, -0.2988835, 0.0, -0.26286808],
+                [0.49214786, 0.0, 0.49666074, 0.0, 0.16815135],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestConv1d(flow.unittest.TestCase):
+    def test_conv1d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_conv1d_bias_true,
+            _test_conv1d_bias_false,
+            _test_conv1d_dilation,
+            _test_conv1d_stride,
+            _test_conv1d_group_bias_true,
+            _test_conv1d_group_large_out_bias_true,
+            _test_conv1d_group_large_in_bias_true,
+            _test_conv1d_compilcate,
+        ]
+        arg_dict["device"] = ["cuda", "cpu"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_crossentropyloss.py b/python/oneflow/test/modules/test_crossentropyloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..23031a5067eaa317b73aa16f415d67e7820ab4b3
--- /dev/null
+++ b/python/oneflow/test/modules/test_crossentropyloss.py
@@ -0,0 +1,49 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+from automated_test_util import *
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCrossEntropyLossModule(flow.unittest.TestCase):
+    @unittest.skip("nn.CrossEntropyLoss has bug")
+    @autotest(n=200)
+    def test_CrossEntropyLoss_with_random_data(test_case):
+        num_classes = random()
+        shape = random_tensor(ndim=random(2, 5), dim1=num_classes).value().shape
+        m = torch.nn.CrossEntropyLoss(
+            reduction=oneof("none", "sum", "mean", nothing()),
+            ignore_index=random(0, num_classes) | nothing(),
+        )
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor(len(shape), *shape).to(device)
+        target = random_pytorch_tensor(
+            len(shape) - 1, *shape[:1] + shape[2:], low=0, high=num_classes, dtype=int
+        ).to(device)
+        y = m(x, target)
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_ctc_loss.py b/python/oneflow/test/modules/test_ctc_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd643d9b202e7131d141af57b4929e4aeaf7f4f0
--- /dev/null
+++ b/python/oneflow/test/modules/test_ctc_loss.py
@@ -0,0 +1,293 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+ninf = -float("inf")
+
+
+def _logsumexp(a, b):
+    if a < b:
+        (a, b) = (b, a)
+    if b == ninf:
+        return a
+    else:
+        return a + np.log(1 + np.exp(b - a))
+
+
+def logsumexp(*args):
+    res = args[0]
+    for e in args[1:]:
+        res = _logsumexp(res, e)
+    return res
+
+
+def log_softmax(logits, axis=0):
+    max_value = np.max(logits, axis, keepdims=True)
+    exp = np.exp(logits - max_value)
+    exp_sum = np.sum(exp, axis, keepdims=True)
+    dist = exp / exp_sum
+    return np.log(dist)
+
+
+def get_target_prime(targets, b, s, blank):
+    if s % 2 == 0:
+        return blank
+    else:
+        return targets[b, s // 2]
+
+
+def ctc_loss_np(log_probs, targets, input_lengths, target_lengths, blank=0):
+    (max_input_length, batch_size, _) = log_probs.shape
+    (_, max_target_length) = targets.shape
+    loss = np.zeros(batch_size)
+    alpha = np.zeros([batch_size, max_input_length, 2 * max_target_length + 1])
+    alpha[:, 0] = ninf
+    for b in range(0, batch_size):
+        input_length = input_lengths[b]
+        target_length = target_lengths[b]
+        alpha[b, 0, 0] = log_probs[0, b, blank]
+        if target_length > 0:
+            current_target_prime = get_target_prime(targets, b, 1, blank)
+            alpha[b, 0, 1] = log_probs[0, b, current_target_prime]
+        for t in range(1, input_length):
+            for s in range(0, 2 * target_length + 1):
+                current_target_prime = get_target_prime(targets, b, s, blank)
+                la1 = alpha[b, t - 1, s]
+                if s > 0:
+                    la2 = alpha[b, t - 1, s - 1]
+                else:
+                    la2 = ninf
+                if (
+                    s > 1
+                    and get_target_prime(targets, b, s - 2, blank)
+                    != current_target_prime
+                ):
+                    la3 = alpha[b, t - 1, s - 2]
+                else:
+                    la3 = ninf
+                alpha[b, t, s] = (
+                    logsumexp(la1, la2, la3) + log_probs[t, b, current_target_prime]
+                )
+        if target_length == 0:
+            loss[b] = -alpha[b, input_length - 1, 0]
+        else:
+            l1 = alpha[b, input_length - 1, target_length * 2]
+            l2 = alpha[b, input_length - 1, target_length * 2 - 1]
+            loss[b] = -logsumexp(l1, l2)
+    return (loss, alpha)
+
+
+def ctc_loss_grad_np(
+    grad_out,
+    loss,
+    alpha,
+    log_probs,
+    targets,
+    input_lengths,
+    target_lengths,
+    blank=0,
+    zero_infinity=False,
+):
+    (max_input_length, batch_size, num_labels) = log_probs.shape
+    (_, max_target_length) = targets.shape
+    beta = np.zeros([batch_size, max_input_length, 2 * max_target_length + 1])
+    grad = np.zeros(log_probs.shape, dtype=log_probs.dtype)
+    grad.fill(ninf)
+    for b in range(0, batch_size):
+        input_length = input_lengths[b]
+        target_length = target_lengths[b]
+        nll = loss[b]
+        if zero_infinity and nll == float("inf"):
+            grad[:, b, :] = 0
+            continue
+        if input_length > 0:
+            beta[b, input_length - 1, :] = ninf
+            beta[b, input_length - 1, 2 * target_length] = log_probs[
+                input_length - 1, b, blank
+            ]
+            grad[input_length - 1, b, blank] = (
+                alpha[b, input_length - 1, 2 * target_length]
+                + beta[b, input_length - 1, 2 * target_length]
+            )
+            if target_length > 0:
+                current_target_prime = get_target_prime(
+                    targets, b, 2 * target_length - 1, blank
+                )
+                beta[b, input_length - 1, 2 * target_length - 1] = log_probs[
+                    input_length - 1, b, current_target_prime
+                ]
+                grad[input_length - 1, b, current_target_prime] = (
+                    alpha[b, input_length - 1, 2 * target_length - 1]
+                    + beta[b, input_length - 1, 2 * target_length - 1]
+                )
+        for t in range(input_length - 2, -1, -1):
+            for s in range(2 * target_length, -1, -1):
+                current_target_prime = get_target_prime(targets, b, s, blank)
+                lb1 = beta[b, t + 1, s]
+                if s < 2 * target_length:
+                    lb2 = beta[b, t + 1, s + 1]
+                else:
+                    lb2 = ninf
+                if (
+                    s < 2 * target_length - 1
+                    and get_target_prime(targets, b, s + 2, blank)
+                    != current_target_prime
+                ):
+                    lb3 = beta[b, t + 1, s + 2]
+                else:
+                    lb3 = ninf
+                beta[b, t, s] = (
+                    logsumexp(lb1, lb2, lb3) + log_probs[t, b, current_target_prime]
+                )
+                alpha_beta = alpha[b, t, s] + beta[b, t, s]
+                lcab = grad[t, b, current_target_prime]
+                if lcab == ninf:
+                    grad[t, b, current_target_prime] = alpha_beta
+                else:
+                    grad[t, b, current_target_prime] = logsumexp(lcab, alpha_beta)
+        for t in range(0, input_length):
+            for c in range(0, num_labels):
+                res = grad[t, b, c]
+                lp = log_probs[t, b, c]
+                grad[t, b, c] = (np.exp(lp) - np.exp(res + nll - lp)) * grad_out[b]
+        if input_length < max_input_length:
+            grad[input_length:max_input_length, b] = 0
+    return grad
+
+
+def compare_with_np(
+    device_type,
+    device_num,
+    data_type,
+    max_input_length,
+    batch_size,
+    num_classes,
+    max_target_length,
+    blank,
+    reduction,
+    zero_infinity,
+):
+    assert data_type in ["float32", "double"]
+    assert device_type in ["cuda", "cpu"]
+    assert reduction in ["none", "mean", "sum"]
+    assert zero_infinity in [False, True]
+    log_probs = np.random.random(
+        size=(max_input_length, batch_size, num_classes)
+    ).astype(np.float32)
+    log_probs = log_softmax(log_probs, axis=2)
+    targets = np.random.randint(
+        1, high=num_classes, size=(batch_size, max_target_length), dtype=np.int32
+    )
+    input_lengths = np.random.randint(
+        max_input_length / 2, high=max_input_length, size=(batch_size,), dtype=np.int32
+    )
+    target_lengths = np.random.randint(
+        max_target_length / 2,
+        high=max_target_length,
+        size=(batch_size,),
+        dtype=np.int32,
+    )
+    (np_loss, np_alpha) = ctc_loss_np(
+        log_probs, targets, input_lengths, target_lengths, blank
+    )
+    np_out = np.where(np_loss == float("inf"), 0, np_loss) if zero_infinity else np_loss
+    if reduction == "mean":
+        np_out = np.mean(
+            np.divide(np_out, np.clip(target_lengths, 1, a_max=None).astype(np.float32))
+        )
+    elif reduction == "sum":
+        np_out = np.sum(np_out)
+    np_grad_out = np.ones_like(np_loss, dtype=np.float32)
+    if reduction == "mean":
+        np_grad_out = np.divide(
+            np_grad_out, np.clip(target_lengths, 1, a_max=None).astype(np.float32)
+        )
+        np_grad_out /= target_lengths.size
+    np_grad = ctc_loss_grad_np(
+        np_grad_out,
+        np_loss,
+        np_alpha,
+        log_probs,
+        targets,
+        input_lengths,
+        target_lengths,
+        blank,
+        zero_infinity,
+    )
+    ctc_loss = flow.nn.CTCLoss(
+        blank=blank, reduction=reduction, zero_infinity=zero_infinity
+    )
+    log_probs = flow.Tensor(
+        log_probs,
+        dtype=flow.float32,
+        requires_grad=True,
+        device=flow.device(device_type),
+    )
+    targets = flow.Tensor(
+        targets, dtype=flow.int32, requires_grad=False, device=flow.device(device_type)
+    )
+    input_lengths = flow.Tensor(
+        input_lengths,
+        dtype=flow.int32,
+        requires_grad=False,
+        device=flow.device(device_type),
+    )
+    target_lengths = flow.Tensor(
+        target_lengths,
+        dtype=flow.int32,
+        requires_grad=False,
+        device=flow.device(device_type),
+    )
+    ctc_loss = ctc_loss.to(device_type)
+    of_out = ctc_loss(log_probs, targets, input_lengths, target_lengths)
+    assert np.allclose(of_out.numpy(), np_out, atol=1e-05)
+    of_out = of_out.sum()
+    of_out.backward()
+    assert np.allclose(log_probs.grad.numpy(), np_grad, atol=1e-05, equal_nan=True)
+
+
+def gen_arg_list():
+    arg_dict = OrderedDict()
+    arg_dict["device_type"] = ["cuda", "cpu"]
+    arg_dict["device_num"] = [1]
+    arg_dict["data_type"] = ["float32"]
+    arg_dict["max_input_length"] = [20]
+    arg_dict["batch_size"] = [4]
+    arg_dict["num_classes"] = [5]
+    arg_dict["max_target_length"] = [10]
+    arg_dict["blank"] = [0, 4]
+    arg_dict["reduction"] = ["mean", "none"]
+    arg_dict["zero_infinity"] = [False, True]
+    return GenArgList(arg_dict)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCTCLoss1n1d(flow.unittest.TestCase):
+    def test_ctc_loss(test_case):
+        for arg in gen_arg_list():
+            compare_with_np(*arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_dataset.py b/python/oneflow/test/modules/test_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..05abf36af24a50f8b26e803af80b0e532325b3cc
--- /dev/null
+++ b/python/oneflow/test/modules/test_dataset.py
@@ -0,0 +1,282 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import os
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestOFRecordModule(flow.unittest.TestCase):
+    def test_record(test_case):
+        batch_size = 1
+        color_space = "RGB"
+        height = 224
+        width = 224
+        output_layout = "NCHW"
+        rgb_mean = [123.68, 116.779, 103.939]
+        rgb_std = [58.393, 57.12, 57.375]
+        record_reader = flow.nn.OfrecordReader(
+            "/dataset/imagenette/ofrecord",
+            batch_size=batch_size,
+            data_part_num=1,
+            part_name_suffix_length=5,
+            shuffle_after_epoch=False,
+        )
+        record_image_decoder = flow.nn.OFRecordImageDecoder(
+            "encoded", color_space=color_space
+        )
+        record_label_decoder = flow.nn.OfrecordRawDecoder(
+            "class/label", shape=(), dtype=flow.int32
+        )
+        resize = flow.nn.image.Resize(
+            resize_side="shorter", keep_aspect_ratio=True, target_size=256
+        )
+        crop_mirror_normal = flow.nn.CropMirrorNormalize(
+            color_space=color_space,
+            output_layout=output_layout,
+            crop_h=height,
+            crop_w=width,
+            crop_pos_y=0.5,
+            crop_pos_x=0.5,
+            mean=rgb_mean,
+            std=rgb_std,
+            output_dtype=flow.float,
+        )
+        val_record = record_reader()
+        label = record_label_decoder(val_record)
+        image_raw_buffer = record_image_decoder(val_record)
+        image_raw_buffer_nd = image_raw_buffer.numpy()[0]
+        gt_np = cv2.imread("/dataset/imagenette/ofrecord/gt_tensor_buffer_image.png")
+        test_case.assertTrue(np.array_equal(image_raw_buffer_nd, gt_np))
+        image = resize(image_raw_buffer)[0]
+        resized_image_raw_buffer_nd = image.numpy()[0]
+        gt_np = cv2.imread(
+            "/dataset/imagenette/ofrecord/gt_tensor_buffer_resized_image.png"
+        )
+        test_case.assertTrue(np.array_equal(resized_image_raw_buffer_nd, gt_np))
+        image = crop_mirror_normal(image)
+        image_np = image.numpy()
+        image_np = np.squeeze(image_np)
+        image_np = np.transpose(image_np, (1, 2, 0))
+        image_np = image_np * rgb_std + rgb_mean
+        image_np = cv2.cvtColor(np.float32(image_np), cv2.COLOR_RGB2BGR)
+        image_np = image_np.astype(np.uint8)
+        gt_np = cv2.imread("/dataset/imagenette/ofrecord/gt_val_image.png")
+        test_case.assertEqual(label.numpy()[0], 5)
+        test_case.assertTrue(np.array_equal(image_np, gt_np))
+
+
+coco_dict = dict()
+
+
+def _coco(anno_file):
+    global coco_dict
+    if anno_file not in coco_dict:
+        from pycocotools.coco import COCO
+
+        coco_dict[anno_file] = COCO(anno_file)
+    return coco_dict[anno_file]
+
+
+def _get_coco_image_samples(anno_file, image_dir, image_ids):
+    coco = _coco(anno_file)
+    category_id_to_contiguous_id_map = _get_category_id_to_contiguous_id_map(coco)
+    (image, image_size) = _read_images_with_cv(coco, image_dir, image_ids)
+    bbox = _read_bbox(coco, image_ids)
+    label = _read_label(coco, image_ids, category_id_to_contiguous_id_map)
+    img_segm_poly_list = _read_segm_poly(coco, image_ids)
+    (poly, poly_index) = _segm_poly_list_to_tensor(img_segm_poly_list)
+    samples = []
+    for (im, ims, b, l, p, pi) in zip(image, image_size, bbox, label, poly, poly_index):
+        samples.append(
+            dict(image=im, image_size=ims, bbox=b, label=l, poly=p, poly_index=pi)
+        )
+    return samples
+
+
+def _get_category_id_to_contiguous_id_map(coco):
+    return {v: i + 1 for (i, v) in enumerate(coco.getCatIds())}
+
+
+def _read_images_with_cv(coco, image_dir, image_ids):
+    image_files = [
+        os.path.join(image_dir, coco.imgs[img_id]["file_name"]) for img_id in image_ids
+    ]
+    image_size = [
+        (coco.imgs[img_id]["height"], coco.imgs[img_id]["width"])
+        for img_id in image_ids
+    ]
+    return (
+        [cv2.imread(image_file).astype(np.single) for image_file in image_files],
+        image_size,
+    )
+
+
+def _bbox_convert_from_xywh_to_xyxy(bbox, image_h, image_w):
+    (x, y, w, h) = bbox
+    (x1, y1) = (x, y)
+    x2 = x1 + max(w - 1, 0)
+    y2 = y1 + max(h - 1, 0)
+    x1 = min(max(x1, 0), image_w - 1)
+    y1 = min(max(y1, 0), image_h - 1)
+    x2 = min(max(x2, 0), image_w - 1)
+    y2 = min(max(y2, 0), image_h - 1)
+    if x1 >= x2 or y1 >= y2:
+        return None
+    return [x1, y1, x2, y2]
+
+
+def _read_bbox(coco, image_ids):
+    img_bbox_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        assert len(anno_ids) > 0, "image with id {} has no anno".format(img_id)
+        image_h = coco.imgs[img_id]["height"]
+        image_w = coco.imgs[img_id]["width"]
+        bbox_list = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            bbox = anno["bbox"]
+            assert isinstance(bbox, list)
+            bbox_ = _bbox_convert_from_xywh_to_xyxy(bbox, image_h, image_w)
+            if bbox_ is not None:
+                bbox_list.append(bbox_)
+        bbox_array = np.array(bbox_list, dtype=np.single)
+        img_bbox_list.append(bbox_array)
+    return img_bbox_list
+
+
+def _read_label(coco, image_ids, category_id_to_contiguous_id_map):
+    img_label_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        assert len(anno_ids) > 0, "image with id {} has no anno".format(img_id)
+        label_list = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            cate_id = anno["category_id"]
+            isinstance(cate_id, int)
+            label_list.append(category_id_to_contiguous_id_map[cate_id])
+        label_array = np.array(label_list, dtype=np.int32)
+        img_label_list.append(label_array)
+    return img_label_list
+
+
+def _read_segm_poly(coco, image_ids):
+    img_segm_poly_list = []
+    for img_id in image_ids:
+        anno_ids = coco.getAnnIds(imgIds=[img_id])
+        assert len(anno_ids) > 0, "img {} has no anno".format(img_id)
+        segm_poly_list = []
+        for anno_id in anno_ids:
+            anno = coco.anns[anno_id]
+            if anno["iscrowd"] != 0:
+                continue
+            segm = anno["segmentation"]
+            assert isinstance(segm, list)
+            assert len(segm) > 0, str(len(segm))
+            assert all([len(poly) > 0 for poly in segm]), str(
+                [len(poly) for poly in segm]
+            )
+            segm_poly_list.append(segm)
+        img_segm_poly_list.append(segm_poly_list)
+    return img_segm_poly_list
+
+
+def _segm_poly_list_to_tensor(img_segm_poly_list):
+    poly_array_list = []
+    poly_index_array_list = []
+    for (img_idx, segm_poly_list) in enumerate(img_segm_poly_list):
+        img_poly_elem_list = []
+        img_poly_index_list = []
+        for (obj_idx, poly_list) in enumerate(segm_poly_list):
+            for (poly_idx, poly) in enumerate(poly_list):
+                img_poly_elem_list.extend(poly)
+                for (pt_idx, pt) in enumerate(poly):
+                    if pt_idx % 2 == 0:
+                        img_poly_index_list.append([pt_idx / 2, poly_idx, obj_idx])
+        img_poly_array = np.array(img_poly_elem_list, dtype=np.single).reshape(-1, 2)
+        assert img_poly_array.size > 0, segm_poly_list
+        poly_array_list.append(img_poly_array)
+        img_poly_index_array = np.array(img_poly_index_list, dtype=np.int32)
+        assert img_poly_index_array.size > 0, segm_poly_list
+        poly_index_array_list.append(img_poly_index_array)
+    return (poly_array_list, poly_index_array_list)
+
+
+@flow.unittest.skip_unless_1n1d()
+@flow.unittest.skip_unless_1n1d()
+class TestCocoReader(flow.unittest.TestCase):
+    def test_coco_reader(test_case):
+        anno_file = "/dataset/mscoco_2017/annotations/instances_val2017.json"
+        image_dir = "/dataset/mscoco_2017/val2017"
+        num_iterations = 100
+        coco_reader = flow.nn.COCOReader(
+            annotation_file=anno_file,
+            image_dir=image_dir,
+            batch_size=2,
+            shuffle=True,
+            stride_partition=True,
+        )
+        image_decoder = flow.nn.image.decode(dtype=flow.float)
+        for i in range(num_iterations):
+            (
+                image,
+                image_id,
+                image_size,
+                gt_bbox,
+                gt_label,
+                gt_segm,
+                gt_segm_index,
+            ) = coco_reader()
+            decoded_image = image_decoder(image)
+            image_list = decoded_image.numpy()
+            image_id = image_id.numpy()
+            image_size = image_size.numpy()
+            bbox_list = gt_bbox.numpy()
+            label_list = gt_label.numpy()
+            segm_list = gt_segm.numpy()
+            segm_index_list = gt_segm_index.numpy()
+            samples = _get_coco_image_samples(anno_file, image_dir, image_id)
+            for (i, sample) in enumerate(samples):
+                test_case.assertTrue(np.array_equal(image_list[i], sample["image"]))
+                test_case.assertTrue(
+                    np.array_equal(image_size[i], sample["image_size"])
+                )
+                test_case.assertTrue(np.allclose(bbox_list[i], sample["bbox"]))
+                cur_label = label_list[i]
+                if len(cur_label.shape) == 0:
+                    cur_label = np.array([cur_label])
+                test_case.assertTrue(np.array_equal(cur_label, sample["label"]))
+                test_case.assertTrue(np.allclose(segm_list[i], sample["poly"]))
+                test_case.assertTrue(
+                    np.array_equal(segm_index_list[i], sample["poly_index"])
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_deconv.py b/python/oneflow/test/modules/test_deconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6e4f7c5f25737e34489789090454c491bcc2c92
--- /dev/null
+++ b/python/oneflow/test/modules/test_deconv.py
@@ -0,0 +1,874 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.nn as nn
+import oneflow.unittest
+
+
+def _test_deconv_bias_false(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [
+                    [0.2735021114349365, -1.3842310905456543],
+                    [1.058540940284729, -0.03388553857803345],
+                ]
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [
+                [
+                    [0.06456436216831207, -0.10852358490228653, -0.21638715267181396],
+                    [-0.2279110550880432, 0.1476770043373108, 0.19457484781742096],
+                    [0.05026858672499657, 0.10818571597337723, 0.02056501805782318],
+                ],
+                [
+                    [0.205095112323761, 0.1488947868347168, -0.2344113141298294],
+                    [0.1684819906949997, -0.21986986696720123, 0.1082606166601181],
+                    [-0.1528974026441574, 0.17120417952537537, 0.01954500749707222],
+                ],
+            ]
+        ]
+    )
+    m = nn.ConvTranspose2d(1, 2, 3, stride=1, bias=False)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m = m.to(device)
+    output = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        0.01765848882496357,
+                        -0.1190534234046936,
+                        0.09103937447071075,
+                        0.2995298206806183,
+                    ],
+                    [
+                        0.006009865552186966,
+                        0.2388070970773697,
+                        -0.37657976150512695,
+                        -0.26200416684150696,
+                    ],
+                    [
+                        -0.22750461101531982,
+                        0.12405071407556534,
+                        0.056831881403923035,
+                        -0.035060010850429535,
+                    ],
+                    [
+                        0.053211357444524765,
+                        0.11281562596559525,
+                        0.0181029811501503,
+                        -0.0006968567031435668,
+                    ],
+                ],
+                [
+                    [
+                        0.05609394609928131,
+                        -0.24317599833011627,
+                        -0.27021679282188416,
+                        0.32447943091392517,
+                    ],
+                    [
+                        0.26318174600601196,
+                        -0.14269141852855682,
+                        0.08078087121248245,
+                        -0.14191456139087677,
+                    ],
+                    [
+                        0.13652732968330383,
+                        0.020019691437482834,
+                        -0.10959184169769287,
+                        -0.03072327747941017,
+                    ],
+                    [
+                        -0.16184815764427185,
+                        0.1864076405763626,
+                        0.014887845143675804,
+                        -0.0006622931105084717,
+                    ],
+                ],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = [
+        [
+            [
+                [0.24731683731079102, 0.24731683731079102],
+                [0.24731683731079102, 0.24731683731079102],
+            ]
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_deconv_bias_true(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [
+                    [0.2735021114349365, -1.3842310905456543],
+                    [1.058540940284729, -0.03388553857803345],
+                ]
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    weight = np.array(
+        [
+            [
+                [
+                    [0.06456436216831207, -0.10852358490228653, -0.21638715267181396],
+                    [-0.2279110550880432, 0.1476770043373108, 0.19457484781742096],
+                    [0.05026858672499657, 0.10818571597337723, 0.02056501805782318],
+                ],
+                [
+                    [0.205095112323761, 0.1488947868347168, -0.2344113141298294],
+                    [0.1684819906949997, -0.21986986696720123, 0.1082606166601181],
+                    [-0.1528974026441574, 0.17120417952537537, 0.01954500749707222],
+                ],
+            ]
+        ]
+    )
+    bias = np.array([0.06456436216831207, -0.10852358490228653])
+    m = nn.ConvTranspose2d(1, 2, 3, stride=1)
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m.bias = flow.nn.Parameter(flow.Tensor(bias))
+    m = m.to(device)
+    output = m(input)
+    np_out = [
+        [
+            [
+                [
+                    0.0822228491306305,
+                    -0.05448906123638153,
+                    0.15560373663902283,
+                    0.36409419775009155,
+                ],
+                [
+                    0.07057422399520874,
+                    0.30337145924568176,
+                    -0.3120154142379761,
+                    -0.19743980467319489,
+                ],
+                [
+                    -0.16294024884700775,
+                    0.188615083694458,
+                    0.12139624357223511,
+                    0.029504351317882538,
+                ],
+                [
+                    0.11777572333812714,
+                    0.17737999558448792,
+                    0.08266734331846237,
+                    0.06386750191450119,
+                ],
+            ],
+            [
+                [
+                    -0.05242963880300522,
+                    -0.3516995906829834,
+                    -0.3787403702735901,
+                    0.21595585346221924,
+                ],
+                [
+                    0.15465816855430603,
+                    -0.25121501088142395,
+                    -0.027742713689804077,
+                    -0.2504381537437439,
+                ],
+                [
+                    0.028003744781017303,
+                    -0.088503897190094,
+                    -0.2181154191493988,
+                    -0.139246866106987,
+                ],
+                [
+                    -0.2703717350959778,
+                    0.07788405567407608,
+                    -0.09363573789596558,
+                    -0.10918587446212769,
+                ],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = [
+        [
+            [
+                [0.24731683731079102, 0.24731683731079102],
+                [0.24731683731079102, 0.24731683731079102],
+            ]
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_deconv_group_bias_false(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [
+                    [-2.0125174206754517, 1.9917882689443576],
+                    [0.13146748727936577, -0.5356457374181375],
+                ],
+                [
+                    [1.020683505853394, 1.2900643048299678],
+                    [-0.549010560600543, 0.8088391626901512],
+                ],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    m = nn.ConvTranspose2d(2, 2, 3, stride=1, groups=2, bias=False)
+    weight = np.array(
+        [
+            [
+                [
+                    [0.06456436216831207, -0.10852358490228653, -0.21638715267181396],
+                    [-0.2279110550880432, 0.1476770043373108, 0.19457484781742096],
+                    [0.05026858672499657, 0.10818571597337723, 0.02056501805782318],
+                ]
+            ],
+            [
+                [
+                    [0.205095112323761, 0.1488947868347168, -0.2344113141298294],
+                    [0.1684819906949997, -0.21986986696720123, 0.1082606166601181],
+                    [-0.1528974026441574, 0.17120417952537537, 0.01954500749707222],
+                ]
+            ],
+        ]
+    )
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m = m.to(device)
+    output = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        -0.12993690371513367,
+                        0.34700414538383484,
+                        0.219326913356781,
+                        -0.43099740147590637,
+                    ],
+                    [
+                        0.4671630859375,
+                        -0.8000040054321289,
+                        -0.06776165962219238,
+                        0.5034587383270264,
+                    ],
+                    [
+                        -0.13112929463386536,
+                        0.02389305830001831,
+                        0.12057329714298248,
+                        -0.06326202303171158,
+                    ],
+                    [
+                        0.00660868501290679,
+                        -0.012703249230980873,
+                        -0.05524558573961258,
+                        -0.011015564203262329,
+                    ],
+                ],
+                [
+                    [
+                        0.20933720469474792,
+                        0.4165603518486023,
+                        -0.04717591404914856,
+                        -0.3024056851863861,
+                    ],
+                    [
+                        0.059367403388023376,
+                        0.07707919180393219,
+                        0.07597976922988892,
+                        -0.049937888979911804,
+                    ],
+                    [
+                        -0.24855825304985046,
+                        0.2344835251569748,
+                        0.003538096323609352,
+                        0.11277973651885986,
+                    ],
+                    [
+                        0.08394229412078857,
+                        -0.21766230463981628,
+                        0.12774622440338135,
+                        0.015808766707777977,
+                    ],
+                ],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = [
+        [
+            [
+                [0.03301373869180679, 0.03301373869180679],
+                [0.03301373869180679, 0.03301373869180679],
+            ],
+            [
+                [0.21430310606956482, 0.21430310606956482],
+                [0.21430310606956482, 0.21430310606956482],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_deconv_group_bias_true(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [
+                    [-2.0125174206754517, 1.9917882689443576],
+                    [0.13146748727936577, -0.5356457374181375],
+                ],
+                [
+                    [1.020683505853394, 1.2900643048299678],
+                    [-0.549010560600543, 0.8088391626901512],
+                ],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    m = nn.ConvTranspose2d(2, 2, 3, stride=1, groups=2)
+    weight = np.array(
+        [
+            [
+                [
+                    [0.06456436216831207, -0.10852358490228653, -0.21638715267181396],
+                    [-0.2279110550880432, 0.1476770043373108, 0.19457484781742096],
+                    [0.05026858672499657, 0.10818571597337723, 0.02056501805782318],
+                ]
+            ],
+            [
+                [
+                    [0.205095112323761, 0.1488947868347168, -0.2344113141298294],
+                    [0.1684819906949997, -0.21986986696720123, 0.1082606166601181],
+                    [-0.1528974026441574, 0.17120417952537537, 0.01954500749707222],
+                ]
+            ],
+        ]
+    )
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    bias = np.array([0.06456436216831207, -0.10852358490228653])
+    m.bias = flow.nn.Parameter(flow.Tensor(bias))
+    m = m.to(device)
+    output = m(input)
+    np_out = [
+        [
+            [
+                [
+                    -0.0653725415468216,
+                    0.4115685224533081,
+                    0.2838912606239319,
+                    -0.3664330244064331,
+                ],
+                [
+                    0.5317274332046509,
+                    -0.735439658164978,
+                    -0.00319729745388031,
+                    0.5680230855941772,
+                ],
+                [
+                    -0.06656493246555328,
+                    0.08845742046833038,
+                    0.18513765931129456,
+                    0.0013023391366004944,
+                ],
+                [
+                    0.0711730495095253,
+                    0.05186111479997635,
+                    0.009318776428699493,
+                    0.053548797965049744,
+                ],
+            ],
+            [
+                [
+                    0.1008136197924614,
+                    0.30803677439689636,
+                    -0.1556994915008545,
+                    -0.41092926263809204,
+                ],
+                [
+                    -0.04915618151426315,
+                    -0.03144439309835434,
+                    -0.032543815672397614,
+                    -0.15846148133277893,
+                ],
+                [
+                    -0.3570818305015564,
+                    0.12595993280410767,
+                    -0.10498549044132233,
+                    0.004256151616573334,
+                ],
+                [
+                    -0.024581290781497955,
+                    -0.3261858820915222,
+                    0.019222639501094818,
+                    -0.0927148163318634,
+                ],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = [
+        [
+            [
+                [0.03301373869180679, 0.03301373869180679],
+                [0.03301373869180679, 0.03301373869180679],
+            ],
+            [
+                [0.21430310606956482, 0.21430310606956482],
+                [0.21430310606956482, 0.21430310606956482],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_deconv_group_large_out_channel(test_case, device):
+    np_arr = np.array(
+        [
+            [
+                [
+                    [-2.0125174206754517, 1.9917882689443576],
+                    [0.13146748727936577, -0.5356457374181375],
+                ],
+                [
+                    [1.020683505853394, 1.2900643048299678],
+                    [-0.549010560600543, 0.8088391626901512],
+                ],
+            ]
+        ]
+    )
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    m = nn.ConvTranspose2d(2, 6, 3, stride=1, groups=2, bias=False)
+    weight = np.array(
+        [
+            [
+                [
+                    [0.05271657928824425, -0.08860913664102554, -0.17667937278747559],
+                    [-0.18608860671520233, 0.12057777494192123, 0.1588696986436844],
+                    [0.04104413092136383, 0.08833327144384384, 0.016791267320513725],
+                ],
+                [
+                    [0.16745945811271667, 0.1215720921754837, -0.19139604270458221],
+                    [0.13756497204303741, -0.17952299118041992, 0.08839442580938339],
+                    [-0.12484020739793777, 0.13978762924671173, 0.015958432108163834],
+                ],
+                [
+                    [-0.07709092646837234, -0.029757702723145485, -0.18154984712600708],
+                    [-0.14461342990398407, 0.06567336618900299, 0.05665326863527298],
+                    [0.04441174864768982, -0.04477253183722496, 0.191376194357872],
+                ],
+            ],
+            [
+                [
+                    [0.1850736141204834, 0.07141514122486115, 0.05791180208325386],
+                    [0.07253318279981613, -0.042754165828228, -0.14045141637325287],
+                    [0.08525089919567108, 0.009758883155882359, -0.07303793728351593],
+                ],
+                [
+                    [-0.005451973062008619, 0.1499139368534088, 0.16706342995166779],
+                    [-0.05473465472459793, 0.02753184549510479, -0.06856250017881393],
+                    [0.03629609942436218, -0.06238799914717674, -0.041715867817401886],
+                ],
+                [
+                    [0.15021666884422302, -0.10501708835363388, 0.04741475358605385],
+                    [-0.16011257469654083, 0.1280348002910614, 0.11050418764352798],
+                    [-0.10031674802303314, 0.1449088454246521, -0.16990724205970764],
+                ],
+            ],
+        ]
+    )
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m = m.to(device)
+    output = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        -0.10609303414821625,
+                        0.28332769870758057,
+                        0.17907968163490295,
+                        -0.3519079089164734,
+                    ],
+                    [
+                        0.3814370930194855,
+                        -0.653200626373291,
+                        -0.055327147245407104,
+                        0.41107234358787537,
+                    ],
+                    [
+                        -0.10706663131713867,
+                        0.019508585333824158,
+                        0.09844768047332764,
+                        -0.05165322124958038,
+                    ],
+                    [
+                        0.005395968910306692,
+                        -0.010372160002589226,
+                        -0.04510783404111862,
+                        -0.00899417046457529,
+                    ],
+                ],
+                [
+                    [
+                        -0.3370150923728943,
+                        0.08887782692909241,
+                        0.6273337602615356,
+                        -0.38122040033340454,
+                    ],
+                    [
+                        -0.25483641028404236,
+                        0.561577320098877,
+                        -0.6257490515708923,
+                        0.27858346700668335,
+                    ],
+                    [
+                        0.26932841539382935,
+                        -0.6272678375244141,
+                        0.35409244894981384,
+                        -0.015562277287244797,
+                    ],
+                    [
+                        -0.01641242951154709,
+                        0.08524765074253082,
+                        -0.0727786272764206,
+                        -0.008548066020011902,
+                    ],
+                ],
+                [
+                    [
+                        0.15514683723449707,
+                        -0.09366090595722198,
+                        0.3061012029647827,
+                        -0.3616088628768921,
+                    ],
+                    [
+                        0.28090208768844604,
+                        -0.38282686471939087,
+                        0.008863434195518494,
+                        0.21008771657943726,
+                    ],
+                    [
+                        -0.10839138925075531,
+                        0.2646597623825073,
+                        -0.5020549297332764,
+                        0.35083478689193726,
+                    ],
+                    [
+                        0.005838701035827398,
+                        -0.029675094410777092,
+                        0.04914196580648422,
+                        -0.10250984132289886,
+                    ],
+                ],
+                [
+                    [
+                        0.18890158832073212,
+                        0.3116491138935089,
+                        0.15123975276947021,
+                        0.074709951877594,
+                    ],
+                    [
+                        -0.027573950588703156,
+                        0.16042113304138184,
+                        -0.17254289984703064,
+                        -0.1343500316143036,
+                    ],
+                    [
+                        0.047192707657814026,
+                        0.20208004117012024,
+                        -0.01943095773458481,
+                        -0.20782624185085297,
+                    ],
+                    [
+                        -0.04680364578962326,
+                        0.06359653919935226,
+                        0.04799196869134903,
+                        -0.05907594412565231,
+                    ],
+                ],
+                [
+                    [
+                        -0.005564738996326923,
+                        0.1459812968969345,
+                        0.3639175295829773,
+                        0.21552257239818573,
+                    ],
+                    [
+                        -0.05287356674671173,
+                        -0.12922403216362,
+                        -0.0049260929226875305,
+                        0.04667740315198898,
+                    ],
+                    [
+                        0.06709674000740051,
+                        -0.0762409120798111,
+                        -0.06315286457538605,
+                        -0.10927218943834305,
+                    ],
+                    [
+                        -0.019926942884922028,
+                        0.06360937654972076,
+                        -0.027559401467442513,
+                        -0.03374142572283745,
+                    ],
+                ],
+                [
+                    [
+                        0.1533236801624298,
+                        0.08659995347261429,
+                        -0.08708333969116211,
+                        0.06116808205842972,
+                    ],
+                    [
+                        -0.24589480459690094,
+                        0.10328409075737,
+                        0.16698980331420898,
+                        0.1809084266424179,
+                    ],
+                    [
+                        -0.014488153159618378,
+                        -0.18130677938461304,
+                        0.056411802768707275,
+                        -0.1298111528158188,
+                    ],
+                    [
+                        0.05507495626807213,
+                        -0.1606965959072113,
+                        0.21048882603645325,
+                        -0.13742762804031372,
+                    ],
+                ],
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = [
+        [
+            [
+                [0.0822635293006897, 0.0822635293006897],
+                [0.0822635293006897, 0.0822635293006897],
+            ],
+            [
+                [0.4193778932094574, 0.4193778932094574],
+                [0.4193778932094574, 0.4193778932094574],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+def _test_deconv_group_large_in_channel(test_case, device):
+    np_arr = [
+        [
+            [
+                [0.6393764315295867, 0.3890587560476374],
+                [0.8467359871201484, 0.24046160407703143],
+            ],
+            [
+                [0.23352071016856402, 0.6760713653927521],
+                [0.061939453383917376, 0.13541973098624682],
+            ],
+            [
+                [0.7524804920779914, 0.34366296030931365],
+                [0.4961502482687954, 0.38175448164636205],
+            ],
+            [
+                [0.01867975512238773, 0.12599156959160163],
+                [0.2658608593205851, 0.6184459583178925],
+            ],
+        ]
+    ]
+    input = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    m = nn.ConvTranspose2d(4, 2, 3, stride=1, groups=2, bias=False)
+    weight = np.array(
+        [
+            [
+                [
+                    [0.09130779653787613, -0.15347552299499512, -0.30601766705513],
+                    [-0.32231491804122925, 0.2088468372821808, 0.27517038583755493],
+                    [0.07109051942825317, 0.1529977172613144, 0.02908332832157612],
+                ]
+            ],
+            [
+                [
+                    [0.2900483012199402, 0.21056903898715973, -0.33150768280029297],
+                    [0.23826952278614044, -0.31094294786453247, 0.15310363471508026],
+                    [-0.21622958779335022, 0.24211928248405457, 0.0276408139616251],
+                ]
+            ],
+            [
+                [
+                    [-0.13352541625499725, -0.051541853696107864, -0.3144535720348358],
+                    [-0.2504778206348419, 0.11374961584806442, 0.09812634438276291],
+                    [0.07692340761423111, -0.0775483027100563, 0.33147329092025757],
+                ]
+            ],
+            [
+                [
+                    [0.3205569088459015, 0.12369465827941895, 0.1003061905503273],
+                    [0.1256311535835266, -0.07405238598585129, -0.24326899647712708],
+                    [0.14765889942646027, 0.016902882605791092, -0.12650541961193085],
+                ]
+            ],
+        ]
+    )
+    m.weight = flow.nn.Parameter(flow.Tensor(weight))
+    m = m.to(device)
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        0.12611234188079834,
+                        0.1826610565185547,
+                        -0.19042569398880005,
+                        -0.34318169951438904,
+                    ],
+                    [
+                        -0.05516064167022705,
+                        0.04093143343925476,
+                        -0.2053149938583374,
+                        0.0920882523059845,
+                    ],
+                    [
+                        -0.2631978690624237,
+                        0.14817529916763306,
+                        0.4988565742969513,
+                        0.11690345406532288,
+                    ],
+                    [
+                        0.04680176079273224,
+                        0.13235820829868317,
+                        0.09591575711965561,
+                        0.010736535303294659,
+                    ],
+                ],
+                [
+                    [
+                        -0.09448734670877457,
+                        -0.04197392612695694,
+                        -0.2368750274181366,
+                        -0.09542831033468246,
+                    ],
+                    [
+                        -0.1671580672264099,
+                        0.16854587197303772,
+                        0.02652890235185623,
+                        -0.05493755638599396,
+                    ],
+                    [
+                        -0.030232630670070648,
+                        0.0058259665966033936,
+                        0.20417997241020203,
+                        -0.015012085437774658,
+                    ],
+                    [
+                        0.07742229104042053,
+                        0.0867031067609787,
+                        0.11167682707309723,
+                        0.048304662108421326,
+                    ],
+                ],
+            ]
+        ]
+    )
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = [
+        [
+            [
+                [0.046688467264175415, 0.046688467264175415],
+                [0.046688467264175415, 0.046688467264175415],
+            ],
+            [
+                [0.30307042598724365, 0.30307042598724365],
+                [0.30307042598724365, 0.30307042598724365],
+            ],
+            [
+                [-0.20727425813674927, -0.20727425813674927],
+                [-0.20727425813674927, -0.20727425813674927],
+            ],
+            [
+                [0.3909238576889038, 0.3909238576889038],
+                [0.3909238576889038, 0.3909238576889038],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDeconv2d(flow.unittest.TestCase):
+    def test_deconv2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_deconv_bias_false,
+            _test_deconv_bias_true,
+            _test_deconv_group_bias_false,
+            _test_deconv_group_bias_true,
+            _test_deconv_group_large_out_channel,
+            _test_deconv_group_large_in_channel,
+        ]
+        arg_dict["device"] = ["cuda", "cpu"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_diag.py b/python/oneflow/test/modules/test_diag.py
new file mode 100644
index 0000000000000000000000000000000000000000..51b0e762635132ecd250b95453186603b758cf22
--- /dev/null
+++ b/python/oneflow/test/modules/test_diag.py
@@ -0,0 +1,156 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_diag_forward(test_case, shape, diagonal, device):
+    input = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.diag(input, diagonal)
+    np_out = np.diag(input.numpy(), diagonal)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+    test_case.assertTrue(
+        np.allclose(
+            input.diag(diagonal=diagonal).numpy(), np_out, 1e-05, 1e-05, equal_nan=True
+        )
+    )
+
+
+def _test_diag_one_dim_backward(test_case, diagonal, device):
+    input = flow.Tensor(
+        np.random.randn(3), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.diag(input, diagonal).sum()
+    of_out.backward()
+    np_grad = np.ones(shape=3)
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+    input = flow.Tensor(
+        np.random.randn(3), device=flow.device(device), requires_grad=True
+    )
+    of_out = input.diag(diagonal=diagonal).sum()
+    of_out.backward()
+    np_grad = np.ones(shape=3)
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_diag_other_dim_backward(test_case, diagonal, device):
+    input = flow.Tensor(
+        np.random.randn(3, 3), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.diag(input, diagonal).sum()
+    of_out.backward()
+    if diagonal > 0:
+        np_grad = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
+    elif diagonal < 0:
+        np_grad = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]])
+    else:
+        np_grad = np.identity(3)
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+    input = flow.Tensor(
+        np.random.randn(3, 3), device=flow.device(device), requires_grad=True
+    )
+    of_out = input.diag(diagonal=diagonal).sum()
+    of_out.backward()
+    if diagonal > 0:
+        np_grad = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
+    elif diagonal < 0:
+        np_grad = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]])
+    else:
+        np_grad = np.identity(3)
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_diag_other_dim_non_square_backward(test_case, diagonal, device):
+    input = flow.Tensor(
+        np.random.randn(3, 4), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.diag(input, diagonal).sum()
+    of_out.backward()
+    if diagonal > 0:
+        np_tmp = np.zeros([3, 1])
+        np_grad = np.identity(3)
+        np_grad = np.hstack((np_tmp, np_grad))
+    elif diagonal < 0:
+        np_grad = np.array([[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0]])
+    else:
+        np_tmp = np.zeros([3, 1])
+        np_grad = np.identity(3)
+        np_grad = np.hstack((np_grad, np_tmp))
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+    input = flow.Tensor(
+        np.random.randn(3, 4), device=flow.device(device), requires_grad=True
+    )
+    of_out = input.diag(diagonal=diagonal).sum()
+    of_out.backward()
+    if diagonal > 0:
+        np_tmp = np.zeros([3, 1])
+        np_grad = np.identity(3)
+        np_grad = np.hstack((np_tmp, np_grad))
+    elif diagonal < 0:
+        np_grad = np.array([[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0]])
+    else:
+        np_tmp = np.zeros([3, 1])
+        np_grad = np.identity(3)
+        np_grad = np.hstack((np_grad, np_tmp))
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDiag(flow.unittest.TestCase):
+    def test_diag_forward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(3,), (3, 3), (3, 4)]
+        arg_dict["diagonal"] = [1, 0, -1]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_diag_forward(test_case, *arg[0:])
+
+    def test_diag_backward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_diag_one_dim_backward,
+            _test_diag_other_dim_backward,
+            _test_diag_other_dim_non_square_backward,
+        ]
+        arg_dict["diagonal"] = [1, 0, -1]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_div.py b/python/oneflow/test/modules/test_div.py
new file mode 100644
index 0000000000000000000000000000000000000000..daba1e6f08f7270644864c639b89177250910fea
--- /dev/null
+++ b/python/oneflow/test/modules/test_div.py
@@ -0,0 +1,105 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_div_impl(test_case, shape, device):
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.div(x, y)
+    np_out = np.divide(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = 5
+    y = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.div(x, y)
+    np_out = np.divide(x, y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = 5
+    of_out = flow.div(x, y)
+    np_out = np.divide(x.numpy(), y)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = flow.Tensor(np.random.randn(1, 1), device=flow.device(device))
+    of_out = flow.div(x, y)
+    np_out = np.divide(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = flow.Tensor(np.array([5.0]), device=flow.device(device))
+    y = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.div(x, y)
+    np_out = np.divide(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    x = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=True
+    )
+    y = flow.Tensor(np.array([5.0]), device=flow.device(device), requires_grad=True)
+    of_out = flow.div(x, y)
+    np_out = np.divide(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad_x = np.full(shape, 0.2)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad_x, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDiv(flow.unittest.TestCase):
+    def test_div(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_div_impl(test_case, *arg)
+
+    def test_sub_against_pytorch(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_type"] = [test_flow_against_pytorch, test_tensor_against_pytorch]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["op"] = ["div"]
+        for arg in GenArgList(arg_dict):
+            arg[0](
+                test_case,
+                arg[2],
+                extra_annotations={"other": flow.Tensor},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=2, dim1=3),
+                    "other": random_tensor(ndim=2, dim0=2, dim1=3),
+                },
+                device=arg[1],
+            )
+            arg[0](
+                test_case,
+                arg[2],
+                extra_annotations={"other": float},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=2, dim1=3),
+                    "other": random(0, 5),
+                },
+                device=arg[1],
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_dropout.py b/python/oneflow/test/modules/test_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..8988f9b5b4e977785ea4e854030e97da0ef96890
--- /dev/null
+++ b/python/oneflow/test/modules/test_dropout.py
@@ -0,0 +1,113 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_dropout(test_case, shape, device):
+    input_arr = np.random.randn(*shape)
+    m = flow.nn.Dropout(p=0)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), input_arr))
+
+
+def _test_dropout_p1(test_case, shape, device):
+    input_arr = np.random.randn(*shape)
+    m = flow.nn.Dropout(p=1.0)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(
+        np.allclose(y.numpy(), np.zeros(input_arr.shape, dtype=np.float32))
+    )
+
+
+def _test_dropout_backward_p0(test_case, shape, device):
+    input_arr = np.random.randn(*shape)
+    m = flow.nn.Dropout(p=0)
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(
+            x.grad.numpy(), np.ones(input_arr.shape, dtype=np.float32), 1e-05, 1e-05
+        )
+    )
+
+
+def _test_dropout_backward_p1(test_case, shape, device):
+    input_arr = np.random.randn(*shape)
+    m = flow.nn.Dropout(p=1)
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(
+            x.grad.numpy(), np.zeros(input_arr.shape, dtype=np.float32), 1e-05, 1e-05
+        )
+    )
+
+
+def _test_dropout_eval(test_case, shape, device):
+    input_arr = np.random.randn(*shape)
+    m = flow.nn.Dropout(p=1)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    m.eval()
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), input_arr))
+
+
+def _test_dropout_with_generator(test_case, shape, device):
+    generator = flow.Generator()
+    generator.manual_seed(0)
+    m = flow.nn.Dropout(p=0.5, generator=generator)
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y_1 = m(x)
+    y_1.numpy()
+    generator.manual_seed(0)
+    y_2 = m(x)
+    test_case.assertTrue(np.allclose(y_1.numpy(), y_2.numpy()))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestDropout(flow.unittest.TestCase):
+    def test_transpose(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_functions"] = [
+            _test_dropout,
+            _test_dropout_p1,
+            _test_dropout_backward_p0,
+            _test_dropout_backward_p1,
+            _test_dropout_eval,
+            _test_dropout_with_generator,
+        ]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_eq.py b/python/oneflow/test/modules/test_eq.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1feca9bbbe5680a848b12114a6bfc48b802d30c
--- /dev/null
+++ b/python/oneflow/test/modules/test_eq.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_eq(test_case, shape, device):
+    arr1 = np.random.randn(*shape)
+    arr2 = np.random.randn(*shape)
+    input = flow.Tensor(arr1, dtype=flow.float32, device=flow.device(device))
+    other = flow.Tensor(arr2, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.eq(input, other)
+    of_out2 = flow.equal(input, other)
+    np_out = np.equal(arr1, arr2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    test_case.assertTrue(np.array_equal(of_out2.numpy(), np_out))
+
+
+def _test_tensor_eq_operator(test_case, shape, device):
+    arr1 = np.random.randn(*shape)
+    arr2 = np.random.randn(*shape)
+    input = flow.Tensor(arr1, dtype=flow.float32, device=flow.device(device))
+    other = flow.Tensor(arr2, dtype=flow.float32, device=flow.device(device))
+    of_out = input.eq(other)
+    np_out = np.equal(arr1, arr2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_eq_int(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1
+    of_out = flow.eq(input, num)
+    np_out = np.equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tensor_eq_operator_int(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1
+    of_out = input.eq(num)
+    np_out = np.equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_eq_float(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1.0
+    of_out = flow.eq(input, num)
+    np_out = np.equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tensor_eq_operator_float(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1.0
+    of_out = input.eq(num)
+    np_out = np.equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestEq(flow.unittest.TestCase):
+    def test_eq(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_func"] = [
+            _test_eq,
+            _test_tensor_eq_operator,
+            _test_eq_int,
+            _test_tensor_eq_operator_int,
+            _test_eq_float,
+            _test_tensor_eq_operator_float,
+        ]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_erf.py b/python/oneflow/test/modules/test_erf.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d9b856f91f42e8c5180b6b23f0deddee140a357
--- /dev/null
+++ b/python/oneflow/test/modules/test_erf.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from scipy import special
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_erf_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.erf(of_input)
+    np_out = special.erf(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(
+            of_input.grad.numpy(),
+            2 / np.sqrt(np.pi) * np.exp(-np.square(of_input.numpy())),
+            1e-05,
+            1e-05,
+        )
+    )
+
+
+def _test_tensor_erf_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input.erf()
+    np_out = special.erf(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(
+            of_input.grad.numpy(),
+            2 / np.sqrt(np.pi) * np.exp(-np.square(of_input.numpy())),
+            1e-05,
+            1e-05,
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestErfModule(flow.unittest.TestCase):
+    def test_erf(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_erf_impl(test_case, *arg)
+            _test_tensor_erf_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_erfc.py b/python/oneflow/test/modules/test_erfc.py
new file mode 100644
index 0000000000000000000000000000000000000000..2520f5fa95a8b903a794320137f0cb1564de44d4
--- /dev/null
+++ b/python/oneflow/test/modules/test_erfc.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from scipy import special
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_erfc_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.erfc(of_input)
+    np_out = special.erfc(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(
+            of_input.grad.numpy(),
+            -(2 / np.sqrt(np.pi)) * np.exp(-np.square(of_input.numpy())),
+            1e-05,
+            1e-05,
+        )
+    )
+
+
+def _test_tensor_erfc_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input.erfc()
+    np_out = special.erfc(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(
+            of_input.grad.numpy(),
+            -(2 / np.sqrt(np.pi)) * np.exp(-np.square(of_input.numpy())),
+            1e-05,
+            1e-05,
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestErfcModule(flow.unittest.TestCase):
+    def test_erfc(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_erfc_impl(test_case, *arg)
+            _test_tensor_erfc_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_exp.py b/python/oneflow/test/modules/test_exp.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ae35229e0fd93f55a8121c2d672523c9d289c76
--- /dev/null
+++ b/python/oneflow/test/modules/test_exp.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_exp_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.exp(of_input)
+    np_out = np.exp(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_out, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestExp(flow.unittest.TestCase):
+    def test_exp(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_exp_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_expand.py b/python/oneflow/test/modules/test_expand.py
new file mode 100644
index 0000000000000000000000000000000000000000..1046dae0606cd2faec9f6aea5539a16e205df540
--- /dev/null
+++ b/python/oneflow/test/modules/test_expand.py
@@ -0,0 +1,197 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_get_expand(input_shape, expand_size):
+    input = np.random.random(size=input_shape).astype(np.float32)
+    input_stride = [1]
+    for i in range(len(input_shape) - 2, -1, -1):
+        input_stride.insert(0, input_stride[0] * input_shape[i + 1])
+    new_size = []
+    new_stride = []
+    diff = len(expand_size) - len(input_shape)
+    for i in range(len(expand_size) - 1, -1, -1):
+        if i >= diff:
+            if expand_size[i] == -1 or expand_size[i] == input_shape[i - diff]:
+                new_size.insert(0, input_shape[i - diff])
+                new_stride.insert(0, input_stride[i - diff])
+            else:
+                assert expand_size[i] >= 1 and input_shape[i - diff] == 1
+                new_size.insert(0, expand_size[i])
+                new_stride.insert(0, 0)
+        else:
+            assert expand_size[i] >= 1
+            new_size.insert(0, expand_size[i])
+            if expand_size[i] == 1:
+                new_stride.insert(0, new_stride[0])
+            else:
+                new_stride.insert(0, 0)
+    gout = np.random.random(size=tuple(new_size)).astype(np.float32)
+    out_stride = [1]
+    for i in range(len(new_size) - 2, -1, -1):
+        out_stride.insert(0, out_stride[0] * new_size[i + 1])
+    gin = np.zeros(input_shape).flatten()
+    out = np.zeros(np.product(new_size))
+
+    def getOffset(i_offset, stride, expand_stride, n):
+        remain = i_offset
+        o_offset = 0
+        for i in range(n):
+            idx = int(remain / stride[i])
+            o_offset += idx * expand_stride[i]
+            remain = remain - idx * stride[i]
+        return o_offset
+
+    in_flatten = input.flatten()
+    gout_flatten = gout.flatten()
+    num_elem = np.product(new_size)
+    dims = len(new_size)
+    for i in range(num_elem):
+        offset = getOffset(i, out_stride, new_stride, dims)
+        gin[offset] += gout_flatten[i]
+        out[i] = in_flatten[offset]
+    return (input, gout, out.reshape(tuple(new_size)), gin.reshape(input_shape))
+
+
+def _test_expand_new_dims(test_case, device):
+    input_shape = (1, 4, 1, 32)
+    expand_dim = [2, 1, 2, 4, 2, 32]
+    (input, gout, out_np, gin_np) = _np_get_expand(input_shape, expand_dim)
+    of_input = flow.Tensor(
+        input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input.expand(2, 1, 2, 4, 2, 32)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), out_np))
+
+
+def _test_expand_same_dim(test_case, device):
+    input_shape = (2, 4, 1, 32)
+    expand_dim = [2, 4, 2, 32]
+    (input, gout, out_np, gin_np) = _np_get_expand(input_shape, expand_dim)
+    of_input = flow.Tensor(input, dtype=flow.float32, device=flow.device(device))
+    of_out = of_input.expand(2, 4, 2, 32)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), out_np))
+
+
+def _test_expand_same_dim_negative(test_case, device):
+    input_shape = (1, 6, 5, 3)
+    expand_dim = [4, -1, 5, 3]
+    (input, gout, out_np, gin_np) = _np_get_expand(input_shape, expand_dim)
+    of_input = flow.Tensor(input, dtype=flow.float32, device=flow.device(device))
+    of_out = of_input.expand(4, -1, 5, 3)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), out_np))
+
+
+def _test_expand_same_int(test_case, device):
+    input_shape = (2, 4, 1, 32)
+    expand_dim = [2, 4, 2, 32]
+    (input, gout, out_np, gin_np) = _np_get_expand(input_shape, expand_dim)
+    of_input = flow.Tensor(input, dtype=flow.int, device=flow.device(device))
+    of_out = of_input.expand(2, 4, 2, 32)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), out_np.astype(np.int32)))
+
+
+def _test_expand_same_int8(test_case, device):
+    input_shape = (2, 4, 1, 32)
+    expand_dim = [2, 4, 2, 32]
+    (input, gout, out_np, gin_np) = _np_get_expand(input_shape, expand_dim)
+    of_input = flow.Tensor(input, dtype=flow.int8, device=flow.device(device))
+    of_out = of_input.expand(2, 4, 2, 32)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), out_np.astype(np.int32)))
+
+
+def _test_expand_backward_same_dim(test_case, device):
+    input_shape = (2, 4, 1, 1)
+    expand_dim = [2, 4, 2, 1]
+    input = np.array(
+        [
+            [
+                [[0.9876952171325684]],
+                [[0.8772538304328918]],
+                [[0.9200366735458374]],
+                [[0.2810221314430237]],
+            ],
+            [
+                [[0.3037724494934082]],
+                [[0.7783719897270203]],
+                [[0.08884672075510025]],
+                [[0.17156553268432617]],
+            ],
+        ]
+    )
+    of_input = flow.Tensor(
+        input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input.expand(2, 4, 2, 1)
+    y = of_out.sum().backward()
+    np_grad = [
+        [[[2.0]], [[2.0]], [[2.0]], [[2.0]]],
+        [[[2.0]], [[2.0]], [[2.0]], [[2.0]]],
+    ]
+    test_case.assertTrue(np.array_equal(of_input.grad.numpy(), np_grad))
+
+
+def _test_expand_backward(test_case, device):
+    input_shape = (1, 4, 1, 2)
+    expand_dim = [2, 1, 2, 4, 2, 2]
+    input = np.array(
+        [
+            [
+                [[0.8981702327728271, 0.5372866988182068]],
+                [[0.45116370916366577, 0.8656941056251526]],
+                [[0.8811476230621338, 0.5552017688751221]],
+                [[0.6291894316673279, 0.5786571502685547]],
+            ]
+        ]
+    )
+    of_input = flow.Tensor(
+        input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = of_input.expand(2, 1, 2, 4, 2, 2)
+    y = of_out.sum().backward()
+    np_grad = [[[[8.0, 8.0]], [[8.0, 8.0]], [[8.0, 8.0]], [[8.0, 8.0]]]]
+    test_case.assertTrue(np.array_equal(of_input.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_expand(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_expand_new_dims,
+            _test_expand_same_dim,
+            _test_expand_same_dim_negative,
+            _test_expand_same_int,
+            _test_expand_same_int8,
+            _test_expand_backward,
+            _test_expand_backward_same_dim,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_expm1.py b/python/oneflow/test/modules/test_expm1.py
new file mode 100644
index 0000000000000000000000000000000000000000..9084a370b62d39c1ade318bc7cf775d57ebe2b05
--- /dev/null
+++ b/python/oneflow/test/modules/test_expm1.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_expm1_impl(test_case, device, shape):
+    x = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.expm1(x)
+    np_out = np.expm1(x.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np.exp(x.numpy()), 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestExpm1Module(flow.unittest.TestCase):
+    def test_expm1(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_expm1_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(1,), (2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_flatten.py b/python/oneflow/test/modules/test_flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..53cb985a24e57d00f5f0ecd67bd8ab8c7afc16e3
--- /dev/null
+++ b/python/oneflow/test/modules/test_flatten.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_flatten(test_case, device):
+    m = flow.nn.Flatten()
+    x = flow.Tensor(32, 2, 5, 5, device=flow.device(device))
+    flow.nn.init.uniform_(x)
+    y = m(x)
+    test_case.assertTrue(y.shape == flow.Size((32, 50)))
+    test_case.assertTrue(np.array_equal(y.numpy().flatten(), x.numpy().flatten()))
+    y2 = flow.flatten(x, start_dim=2)
+    test_case.assertTrue(y2.shape == flow.Size((32, 2, 25)))
+    test_case.assertTrue(np.array_equal(y2.numpy().flatten(), x.numpy().flatten()))
+    y3 = x.flatten(start_dim=1)
+    test_case.assertTrue(y3.shape == flow.Size((32, 50)))
+    test_case.assertTrue(np.array_equal(y3.numpy().flatten(), x.numpy().flatten()))
+    y4 = x.flatten(start_dim=1, end_dim=2)
+    test_case.assertTrue(y4.shape == flow.Size((32, 10, 5)))
+    test_case.assertTrue(np.array_equal(y4.numpy().flatten(), x.numpy().flatten()))
+    y5 = flow.flatten(x)
+    test_case.assertTrue(y5.shape == flow.Size((1600,)))
+    test_case.assertTrue(np.array_equal(y5.numpy().flatten(), x.numpy().flatten()))
+
+
+def _test_flatten_backward(test_case, device):
+    m = flow.nn.Flatten().to(flow.device(device))
+    x = flow.Tensor(2, 3, 4, 5, device=flow.device(device), requires_grad=True)
+    flow.nn.init.uniform_(x)
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(np.array_equal(np.ones(shape=(2, 3, 4, 5)), x.grad.numpy()))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFlattenModule(flow.unittest.TestCase):
+    def test_cast(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_flatten, _test_flatten_backward]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest(auto_backward=False)
+    def test_against_pytorch(test_case):
+        m = torch.nn.Flatten(
+            start_dim=random(1, 6) | nothing(), end_dim=random(1, 6) | nothing()
+        )
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+    @autotest(auto_backward=False)
+    def test_tensor_against_pytorch(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.flatten(
+            start_dim=random(1, 6).to(int) | nothing(),
+            end_dim=random(1, 6).to(int) | nothing(),
+        )
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_flip.py b/python/oneflow/test/modules/test_flip.py
new file mode 100644
index 0000000000000000000000000000000000000000..037a745fde2d0e4973de28b175c8bc3006396d24
--- /dev/null
+++ b/python/oneflow/test/modules/test_flip.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_flip(test_case, device):
+    np_arr = np.arange(0, 16).reshape((2, 2, 2, 2)).astype(np.float32)
+    input = flow.Tensor(np_arr, device=flow.device(device), requires_grad=True)
+    out = flow.flip(input, [0, 1, 2])
+    np_out = [
+        [[[14.0, 15.0], [12.0, 13.0]], [[10.0, 11.0], [8.0, 9.0]]],
+        [[[6.0, 7.0], [4.0, 5.0]], [[2.0, 3.0], [0.0, 1.0]]],
+    ]
+    test_case.assertTrue(np.allclose(out.numpy(), np_out, 1e-05, 1e-05))
+    out = out.sum()
+    out = out.backward()
+    np_grad = np.ones_like(np_arr)
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+class TestFlip(flow.unittest.TestCase):
+    def test_flip(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_flip]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_floor.py b/python/oneflow/test/modules/test_floor.py
new file mode 100644
index 0000000000000000000000000000000000000000..6886405786310656ee38477fac97560186fb5fb8
--- /dev/null
+++ b/python/oneflow/test/modules/test_floor.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_floor(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.floor(of_input)
+    np_out = np.floor(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = np.zeros_like(of_out, dtype=np.float32)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFloor(flow.unittest.TestCase):
+    def test_floor(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_floor(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_fmod.py b/python/oneflow/test/modules/test_fmod.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ab4da4d81684eb000f723d7c72f9fff6e15cee
--- /dev/null
+++ b/python/oneflow/test/modules/test_fmod.py
@@ -0,0 +1,96 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import random as rd
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _numpy_fmod(x, y):
+    sign = np.sign(x)
+    res = np.fmod(np.abs(x), np.abs(y))
+    return sign * res
+
+
+def _numpy_fmod_grad(x):
+    grad = np.ones_like(x)
+    return grad
+
+
+def _test_fmod_same_shape_tensor(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.uniform(-100, 100, shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    other = flow.Tensor(
+        np.random.uniform(-10, 10, shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = flow.fmod(input, other)
+    np_out = _numpy_fmod(input.numpy(), other.numpy())
+    of_out.sum().backward()
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), _numpy_fmod_grad(input.numpy()), 1e-05, 1e-05)
+    )
+
+
+def _test_fmod_tensor_vs_scalar(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randint(-100, 100, shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    other = rd.uniform(-1, 1) * 100
+    of_out = flow.fmod(input, other)
+    np_out = _numpy_fmod(input.numpy(), other)
+    of_out.sum().backward()
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), _numpy_fmod_grad(input.numpy()), 1e-05, 1e-05)
+    )
+
+
+class TestFmodModule(flow.unittest.TestCase):
+    def test_fmod(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [_test_fmod_same_shape_tensor, _test_fmod_tensor_vs_scalar]
+        arg_dict["shape"] = [(2,), (2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest
+    def test_flow_fmod_with_random_data(test_case):
+        device = random_device()
+        input = random_pytorch_tensor().to(device)
+        other = random_pytorch_tensor().to(device)
+        return torch.fmod(input, other)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_functional_docstr.py b/python/oneflow/test/modules/test_functional_docstr.py
new file mode 100644
index 0000000000000000000000000000000000000000..8481df5eee5c2afe35b22cdcbfab4437a135eb9d
--- /dev/null
+++ b/python/oneflow/test/modules/test_functional_docstr.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import inspect
+import unittest
+from collections import OrderedDict
+
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.framework.functional import Function
+
+
+def _is_oneflow_functional(object):
+    return isinstance(object, Function)
+
+
+def _run_functional_doctest(
+    test_case,
+    globs=None,
+    verbose=None,
+    optionflags=0,
+    raise_on_error=True,
+    module=flow.F,
+):
+    import doctest
+
+    parser = doctest.DocTestParser()
+    if raise_on_error:
+        runner = doctest.DebugRunner(verbose=verbose, optionflags=optionflags)
+    else:
+        runner = doctest.DocTestRunner(verbose=verbose, optionflags=optionflags)
+    r = inspect.getmembers(flow.F, _is_oneflow_functional)
+    for (name, fun) in r:
+        if fun.__doc__ is not None:
+            print("test on docstr of: ", ".".join([module.__name__, name]))
+            test = parser.get_doctest(fun.__doc__, {}, __name__, __file__, 0)
+            runner.run(test)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFunctionalDocstrModule(flow.unittest.TestCase):
+    def test_functional_docstr(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["module"] = [flow.F]
+        for arg in GenArgList(arg_dict):
+            _run_functional_doctest(
+                test_case, raise_on_error=True, verbose=None, module=arg[0]
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_gather.py b/python/oneflow/test/modules/test_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed1c072c544d91437c84fd6a3cc5350ac9f0fee9
--- /dev/null
+++ b/python/oneflow/test/modules/test_gather.py
@@ -0,0 +1,119 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _scatter_add_numpy(src, dim, index, outshape):
+    output = np.zeros(outshape)
+    for srcidx in range(0, src.size):
+        outcoord = np.unravel_index(srcidx, src.shape)
+        outcoord = [*outcoord]
+        outcoord[dim] = index[np.unravel_index(srcidx, index.shape)]
+        output_offset = np.ravel_multi_index(outcoord, outshape)
+        output[np.unravel_index(output_offset, outshape)] += src[
+            np.unravel_index(srcidx, src.shape)
+        ]
+    return output
+
+
+def _test_gather(test_case, device):
+    input = np.array([[1, 2], [3, 4]])
+    index = np.array([[0, 0], [1, 0]])
+    np_out = np.take_along_axis(input, index, 0)
+    output = flow.gather(
+        flow.Tensor(input, device=flow.device(device)),
+        flow.Tensor(index, dtype=flow.int, device=flow.device(device)),
+        dim=0,
+    )
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+
+
+def _test_gather_tensor_function(test_case, device):
+    input = np.array([[1, 2], [3, 4]])
+    index = np.array([[0, 0], [1, 0]])
+    np_out = np.take_along_axis(input, index, 1)
+    input = flow.Tensor(input, device=flow.device(device))
+    index = flow.Tensor(index, dtype=flow.int, device=flow.device(device))
+    output = input.gather(index, dim=1)
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+
+
+def _test_gather_random_array(test_case, device):
+    input = np.random.randn(3, 4, 3, 5)
+    index = np.random.choice(np.arange(3), size=180, replace=True).reshape((3, 4, 3, 5))
+    np_out = np.take_along_axis(input, index, 1)
+    output = flow.gather(
+        flow.Tensor(input, device=flow.device(device)),
+        flow.Tensor(index, dtype=flow.int, device=flow.device(device)),
+        dim=1,
+    )
+    test_case.assertTrue(np.allclose(output.numpy(), np_out))
+    np_out2 = np.take_along_axis(input, index, 2)
+    output2 = flow.gather(
+        flow.Tensor(input, device=flow.device(device)),
+        flow.Tensor(index, dtype=flow.int, device=flow.device(device)),
+        dim=2,
+    )
+    test_case.assertTrue(np.allclose(output2.numpy(), np_out2))
+    np_out3 = np.take_along_axis(input, index, 3)
+    output3 = flow.gather(
+        flow.Tensor(input, device=flow.device(device)),
+        flow.Tensor(index, dtype=flow.int, device=flow.device(device)),
+        dim=3,
+    )
+    test_case.assertTrue(np.allclose(output3.numpy(), np_out3))
+
+
+def _test_gather_backward(test_case, device):
+    input = np.array([[1, 2], [3, 4]])
+    index = np.array([[0, 0], [1, 0]])
+    np_out = np.take_along_axis(input, index, 0)
+    np_grad = _scatter_add_numpy(np.ones_like(np_out), 0, index, input.shape)
+    of_input = flow.Tensor(input, requires_grad=True, device=flow.device(device))
+    output = flow.gather(
+        of_input, flow.Tensor(index, dtype=flow.int, device=flow.device(device)), dim=0
+    )
+    out_sum = output.sum()
+    out_sum.backward()
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+    test_case.assertTrue(np.array_equal(of_input.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGather(flow.unittest.TestCase):
+    def test_gather(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_gather,
+            _test_gather_tensor_function,
+            _test_gather_random_array,
+            _test_gather_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_gather_nd.py b/python/oneflow/test/modules/test_gather_nd.py
new file mode 100644
index 0000000000000000000000000000000000000000..8256f3ffbb57c01bd1eb720498409220c454e84a
--- /dev/null
+++ b/python/oneflow/test/modules/test_gather_nd.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_gather_nd(test_case, device):
+    input = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    indices = np.array([[0], [2]])
+    np_out = np.array([[1, 2, 3], [7, 8, 9]])
+    output = flow.gather_nd(
+        flow.Tensor(input, dtype=flow.float, device=flow.device(device)),
+        flow.Tensor(indices, dtype=flow.int, device=flow.device(device)),
+    )
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+
+
+def _test_gather_nd_t(test_case, device):
+    input = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    indices = np.array([[0, 2], [2, 1]])
+    np_out = np.array([3, 8])
+    output = flow.gather_nd(
+        flow.Tensor(input, dtype=flow.float, device=flow.device(device)),
+        flow.Tensor(indices, dtype=flow.int, device=flow.device(device)),
+    )
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+
+
+def _test_gather_nd_backward(test_case, device):
+    input = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    indices = np.array([[0], [2]])
+    np_out = np.array([[1, 2, 3], [7, 8, 9]])
+    np_grad = np.array([[1, 1, 1], [0, 0, 0], [1, 1, 1]])
+    of_input = flow.Tensor(
+        input, requires_grad=True, dtype=flow.float, device=flow.device(device)
+    )
+    output = flow.gather_nd(
+        of_input, flow.Tensor(indices, dtype=flow.int, device=flow.device(device))
+    )
+    out_sum = output.sum()
+    out_sum.backward()
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+    test_case.assertTrue(np.array_equal(of_input.grad.numpy(), np_grad))
+
+
+def _test_gather_nd_backward_t(test_case, device):
+    input = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    indices = np.array([[0, 2], [2, 1]])
+    np_out = np.array([3, 8])
+    np_grad = np.array([[0, 0, 1], [0, 0, 0], [0, 1, 0]])
+    of_input = flow.Tensor(
+        input, requires_grad=True, dtype=flow.float, device=flow.device(device)
+    )
+    output = flow.gather_nd(
+        of_input, flow.Tensor(indices, dtype=flow.int, device=flow.device(device))
+    )
+    out_sum = output.sum()
+    out_sum.backward()
+    test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+    test_case.assertTrue(np.array_equal(of_input.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGather_nd(flow.unittest.TestCase):
+    def test_gather_nd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_gather_nd,
+            _test_gather_nd_t,
+            _test_gather_nd_backward,
+            _test_gather_nd_backward_t,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_generator.py b/python/oneflow/test/modules/test_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ada714ac4acbf8ef76a12721b9f436d2a738759
--- /dev/null
+++ b/python/oneflow/test/modules/test_generator.py
@@ -0,0 +1,105 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import oneflow as flow
+import oneflow.unittest
+
+
+class TestGenerator(flow.unittest.TestCase):
+    def test_different_devices(test_case):
+        auto_gen = flow.Generator(device="auto")
+        cpu_gen = flow.Generator(device="cpu")
+        test_case.assertTrue(auto_gen.initial_seed(), cpu_gen.initial_seed())
+        with test_case.assertRaises(Exception) as context:
+            flow.Generator(device="invalid")
+        test_case.assertTrue("unimplemented" in str(context.exception))
+        if not os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            cuda_gen = flow.Generator(device="cuda")
+            test_case.assertTrue(auto_gen.initial_seed(), cuda_gen.initial_seed())
+
+    def test_generator_manual_seed(test_case):
+        generator = flow.Generator()
+        generator.manual_seed(1)
+        test_case.assertTrue(generator.initial_seed() == 1)
+        generator.manual_seed(2)
+        test_case.assertTrue(generator.initial_seed() == 2)
+
+
+class TestDefaultGenerator(flow.unittest.TestCase):
+    def test_global_manual_seed(test_case):
+        global_seed = 10
+        flow.manual_seed(10)
+        auto_gen = flow.default_generator(device="auto")
+        cpu_gen = flow.default_generator(device="cpu")
+        test_gens = [auto_gen, cpu_gen]
+        if not os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            cuda_gen = flow.default_generator(device="cuda")
+            cuda0_gen = flow.default_generator(device="cuda:0")
+            test_gens += [cuda_gen, cuda0_gen]
+        for gen in test_gens:
+            test_case.assertTrue(gen.initial_seed() == global_seed)
+
+    def test_different_devices(test_case):
+        auto_gen = flow.default_generator(device="auto")
+        cpu_gen = flow.default_generator(device="cpu")
+        with test_case.assertRaises(Exception) as context:
+            flow.default_generator(device="invalid")
+        test_case.assertTrue("unimplemented" in str(context.exception))
+        with test_case.assertRaises(Exception) as context:
+            flow.default_generator(device="cpu:1000")
+        test_case.assertTrue("check_failed" in str(context.exception))
+        test_gens = [cpu_gen]
+        if not os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            with test_case.assertRaises(Exception) as context:
+                flow.default_generator(device="cuda:1000")
+            test_case.assertTrue("check_failed" in str(context.exception))
+            cuda_gen = flow.default_generator(device="cuda")
+            cuda0_gen = flow.default_generator(device="cuda:0")
+            test_gens += [cuda_gen, cuda0_gen]
+        for gen in test_gens:
+            test_case.assertTrue(auto_gen.initial_seed() == gen.initial_seed())
+
+    def test_generator_manual_seed(test_case):
+        auto_gen = flow.default_generator(device="auto")
+        cpu_gen = flow.default_generator(device="cpu")
+        test_gens = [auto_gen, cpu_gen]
+        if not os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            cuda_gen = flow.default_generator(device="cuda")
+            cuda0_gen = flow.default_generator(device="cuda:0")
+            test_gens += [cuda_gen, cuda0_gen]
+        for seed in [1, 2]:
+            auto_gen.manual_seed(seed)
+            for gen in test_gens:
+                test_case.assertTrue(gen.initial_seed() == seed)
+
+    def test_generator_seed(test_case):
+        auto_gen = flow.default_generator(device="auto")
+        cpu_gen = flow.default_generator(device="cpu")
+        test_gens = [auto_gen, cpu_gen]
+        if not os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            cuda_gen = flow.default_generator(device="cuda")
+            cuda0_gen = flow.default_generator(device="cuda:0")
+            test_gens += [cuda_gen, cuda0_gen]
+        for gen in test_gens:
+            seed = gen.seed()
+            test_case.assertTrue(seed == gen.initial_seed())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_greater.py b/python/oneflow/test/modules/test_greater.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7990b6374b0526115d7ffaa3edd0235b3255b12
--- /dev/null
+++ b/python/oneflow/test/modules/test_greater.py
@@ -0,0 +1,123 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_greater_normal(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 1, 4]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    input2 = flow.Tensor(
+        np.array([1, 2, 3]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = flow.gt(input1, input2)
+    np_out = np.greater(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_symbol(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 1, 4]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    input2 = flow.Tensor(
+        np.array([1, 2, 3]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = input1 > input2
+    np_out = np.greater(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_int_scalar(test_case, device):
+    np_arr = np.random.randn(2, 3, 4, 5)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 1
+    of_out = input1 > input2
+    np_out = np.greater(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_int_tensor_int_scalar(test_case, device):
+    np_arr = np.random.randint(2, size=(2, 3, 4, 5))
+    input1 = flow.Tensor(np_arr, dtype=flow.int, device=flow.device(device))
+    input2 = 1
+    of_out = input1 > input2
+    np_out = np.greater(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_float_scalar(test_case, device):
+    np_arr = np.random.randn(3, 2, 5, 7)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 2.3
+    of_out = input1 > input2
+    np_out = np.greater(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGreater(flow.unittest.TestCase):
+    def test_greater(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_greater_normal,
+            _test_greater_symbol,
+            _test_greater_int_scalar,
+            _test_greater_int_tensor_int_scalar,
+            _test_greater_float_scalar,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest(n=60, auto_backward=False)
+    def test_greater_with_random_data(test_case):
+        device = random_device()
+        shape = random_tensor().value().shape
+        x1 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        x2 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        y = torch.gt(x1, oneof(x2, random().to(int), random().to(float)))
+        return y
+
+    @autotest(n=60, auto_backward=False)
+    def test_tensor_greater_with_random_data(test_case):
+        device = random_device()
+        shape = random_tensor().value().shape
+        x1 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        x2 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        y1 = x1.gt(oneof(x2, random().to(int), random().to(float)))
+        y2 = x1 > x2
+        return (y1, y2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_greater_equal.py b/python/oneflow/test/modules/test_greater_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..0216751c315dde6bb040e47ece7a287b67a7bcad
--- /dev/null
+++ b/python/oneflow/test/modules/test_greater_equal.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_greater_equal_normal(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 1, 4]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    input2 = flow.Tensor(
+        np.array([1, 2, 3]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = flow.ge(input1, input2)
+    np_out = np.greater_equal(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_equal_symbol(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 1, 4]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    input2 = flow.Tensor(
+        np.array([1, 2, 3]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = input1 >= input2
+    np_out = np.greater_equal(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_equal_int_scalar(test_case, device):
+    np_arr = np.random.randn(2, 3, 4, 5)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 1
+    of_out = input1 >= input2
+    np_out = np.greater_equal(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_equal_int_tensor_int_scalr(test_case, device):
+    np_arr = np.random.randint(2, size=(2, 3, 4, 5))
+    input1 = flow.Tensor(np_arr, dtype=flow.int, device=flow.device(device))
+    input2 = 1
+    of_out = input1 >= input2
+    np_out = np.greater_equal(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_greater_equal_float_scalar(test_case, device):
+    np_arr = np.random.randn(3, 2, 5, 7)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 2.3
+    of_out = input1 >= input2
+    np_out = np.greater_equal(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGreaterEqual(flow.unittest.TestCase):
+    def test_greter_equal(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_greater_equal_normal,
+            _test_greater_equal_symbol,
+            _test_greater_equal_int_scalar,
+            _test_greater_equal_int_tensor_int_scalr,
+            _test_greater_equal_float_scalar,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_groupnorm.py b/python/oneflow/test/modules/test_groupnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..23df51cd3935dc7df158922441e422e2d7627152
--- /dev/null
+++ b/python/oneflow/test/modules/test_groupnorm.py
@@ -0,0 +1,341 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_groupnorm(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output = np.array(
+        [
+            [
+                [
+                    [-1.0548115, 0.18125379, 0.7097197, -0.4084487],
+                    [0.77542377, -2.0256634, -1.1448141, 0.08885399],
+                    [0.21274385, 1.845322, 0.26973096, -1.0258276],
+                ],
+                [
+                    [0.7019834, -0.17723128, 0.6925037, -0.81073654],
+                    [1.4787737, -0.2959999, 0.96403706, -2.4473464],
+                    [0.4105099, 0.69239473, 0.2711475, 0.09648134],
+                ],
+            ],
+            [
+                [
+                    [1.5438884, 1.5218256, -0.24213786, 0.5900453],
+                    [-0.5118278, 1.1943525, 0.76150376, -0.43229714],
+                    [-1.4360437, -0.4543598, 0.94830114, -1.1312639],
+                ],
+                [
+                    [-1.3314037, 0.9257132, -1.6038253, 0.54077196],
+                    [0.6456222, 1.2084305, -0.18719131, -1.1817979],
+                    [0.28957263, -0.91652036, -1.3678597, 0.6265012],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    m = flow.nn.GroupNorm(num_groups=1, num_channels=2).to(device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_groupnorm_3d(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [
+                        [1.04569761, 0.22863248, 1.42439335, 1.62249689],
+                        [-0.80578825, -0.27276461, 1.04556507, 0.56864134],
+                        [-1.24085419, -1.23960097, 0.33451416, -1.84820402],
+                    ],
+                    [
+                        [-1.511261, 1.06157517, -0.26715858, -1.32888141],
+                        [1.17976881, -0.07931171, 0.33910684, -1.93458573],
+                        [-1.72659647, 0.79049652, 0.39102785, -1.16264882],
+                    ],
+                ],
+                [
+                    [
+                        [0.30067973, -1.2912226, -0.61508225, 0.56454001],
+                        [0.87074187, -1.69257376, 0.36119148, -0.31014289],
+                        [0.20776964, 1.26195488, -1.37122193, -0.17945234],
+                    ],
+                    [
+                        [-0.31112407, -0.80682631, 0.8233194, 0.6384975],
+                        [0.57617527, 0.45505028, 1.68286151, -1.09590744],
+                        [-1.18127546, -1.07529277, 0.52779943, 1.21755926],
+                    ],
+                ],
+            ],
+            [
+                [
+                    [
+                        [-0.12832351, 1.05625455, -0.23253249, -0.64747611],
+                        [-0.00738123, -1.41390089, -1.92664144, -0.21427625],
+                        [-0.94631219, -0.86493989, 0.21026905, 0.24989732],
+                    ],
+                    [
+                        [1.3859182, 1.72002107, 0.50091892, 1.04198896],
+                        [0.71694594, 1.66417023, -1.63030052, 0.77182641],
+                        [0.71545083, 1.96458366, -1.99031931, 1.3196714],
+                    ],
+                ],
+                [
+                    [
+                        [1.80091702, 0.02834973, 0.82259214, -1.05597501],
+                        [-0.58212207, 0.44205949, -0.14740003, -0.994508],
+                        [1.14678114, -0.39196097, 1.2554798, -0.41829324],
+                    ],
+                    [
+                        [-1.0153903, -0.25755713, -1.81756333, -1.06781159],
+                        [1.79680841, -1.9107133, -0.64325796, -1.94640775],
+                        [1.30671156, 1.20445339, -1.26262901, -0.79494188],
+                    ],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output = np.array(
+        [
+            [
+                [
+                    [
+                        [1.0670303, 0.3324034, 1.4075173, 1.5856332],
+                        [-0.5976489, -0.11840499, 1.0669112, 0.6381069],
+                        [-0.9888186, -0.9876919, 0.42760208, -1.5348896],
+                    ],
+                    [
+                        [-1.2319425, 1.0813059, -0.11336456, -1.0679643],
+                        [1.1875744, 0.05552938, 0.43173137, -1.6125557],
+                        [-1.4255517, 0.8375778, 0.4784138, -0.9185038],
+                    ],
+                ],
+                [
+                    [
+                        [0.3447361, -1.3750811, -0.6446106, 0.62979853],
+                        [0.9606047, -1.8086823, 0.41011015, -0.3151683],
+                        [0.24436034, 1.3832531, -1.4615086, -0.17397629],
+                    ],
+                    [
+                        [-0.31622827, -0.8517619, 0.9093717, 0.7096987],
+                        [0.6423687, 0.51151085, 1.8379811, -1.1640717],
+                        [-1.2562994, -1.1418006, 0.59010565, 1.3352901],
+                    ],
+                ],
+            ],
+            [
+                [
+                    [
+                        [-0.23265934, 0.8016156, -0.32364592, -0.6859402],
+                        [-0.12706259, -1.3551185, -1.802801, -0.30770612],
+                        [-0.946859, -0.8758114, 0.06297152, 0.09757163],
+                    ],
+                    [
+                        [1.0894505, 1.3811613, 0.3167428, 0.78916013],
+                        [0.50535965, 1.3323971, -1.5440607, 0.55327666],
+                        [0.50405425, 1.5946931, -1.8583992, 1.0316093],
+                    ],
+                ],
+                [
+                    [
+                        [1.7506906, 0.19012147, 0.8893728, -0.7645185],
+                        [-0.3473382, 0.5543517, 0.03539129, -0.71040297],
+                        [1.174789, -0.17992027, 1.2704874, -0.20310321],
+                    ],
+                    [
+                        [-0.7287877, -0.06159106, -1.4350212, -0.7749395],
+                        [1.7470733, -1.5170306, -0.40116227, -1.548456],
+                        [1.3155918, 1.2255636, -0.9464568, -0.53470486],
+                    ],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    m = flow.nn.GroupNorm(num_groups=2, num_channels=2, affine=False).to(
+        device=flow.device(device)
+    )
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_groupnorm_backward(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    m = flow.nn.GroupNorm(num_groups=1, num_channels=2).to(device=flow.device(device))
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.zeros(shape=input_arr.shape), 1e-05, 1e-05)
+    )
+
+
+def _test_groupnorm_backward_3d(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [
+                        [1.04569761, 0.22863248, 1.42439335, 1.62249689],
+                        [-0.80578825, -0.27276461, 1.04556507, 0.56864134],
+                        [-1.24085419, -1.23960097, 0.33451416, -1.84820402],
+                    ],
+                    [
+                        [-1.511261, 1.06157517, -0.26715858, -1.32888141],
+                        [1.17976881, -0.07931171, 0.33910684, -1.93458573],
+                        [-1.72659647, 0.79049652, 0.39102785, -1.16264882],
+                    ],
+                ],
+                [
+                    [
+                        [0.30067973, -1.2912226, -0.61508225, 0.56454001],
+                        [0.87074187, -1.69257376, 0.36119148, -0.31014289],
+                        [0.20776964, 1.26195488, -1.37122193, -0.17945234],
+                    ],
+                    [
+                        [-0.31112407, -0.80682631, 0.8233194, 0.6384975],
+                        [0.57617527, 0.45505028, 1.68286151, -1.09590744],
+                        [-1.18127546, -1.07529277, 0.52779943, 1.21755926],
+                    ],
+                ],
+            ],
+            [
+                [
+                    [
+                        [-0.12832351, 1.05625455, -0.23253249, -0.64747611],
+                        [-0.00738123, -1.41390089, -1.92664144, -0.21427625],
+                        [-0.94631219, -0.86493989, 0.21026905, 0.24989732],
+                    ],
+                    [
+                        [1.3859182, 1.72002107, 0.50091892, 1.04198896],
+                        [0.71694594, 1.66417023, -1.63030052, 0.77182641],
+                        [0.71545083, 1.96458366, -1.99031931, 1.3196714],
+                    ],
+                ],
+                [
+                    [
+                        [1.80091702, 0.02834973, 0.82259214, -1.05597501],
+                        [-0.58212207, 0.44205949, -0.14740003, -0.994508],
+                        [1.14678114, -0.39196097, 1.2554798, -0.41829324],
+                    ],
+                    [
+                        [-1.0153903, -0.25755713, -1.81756333, -1.06781159],
+                        [1.79680841, -1.9107133, -0.64325796, -1.94640775],
+                        [1.30671156, 1.20445339, -1.26262901, -0.79494188],
+                    ],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    m = flow.nn.GroupNorm(num_groups=2, num_channels=2, affine=False).to(
+        device=flow.device(device)
+    )
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.zeros(shape=input_arr.shape), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestGroupNorm(flow.unittest.TestCase):
+    def test_groupnorm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_groupnorm,
+            _test_groupnorm_3d,
+            _test_groupnorm_backward,
+            _test_groupnorm_backward_3d,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_image_batch_align.py b/python/oneflow/test/modules/test_image_batch_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf21c0b92cee6f2a6f63261e6eb5c985849a54d0
--- /dev/null
+++ b/python/oneflow/test/modules/test_image_batch_align.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import operator
+import unittest
+from functools import reduce
+
+import cv2
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _read_images_by_cv(image_files):
+    images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+    return images
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    image_static_shape.insert(0, len(image_shapes))
+    return image_static_shape
+
+
+def _roundup(x, n):
+    return int((x + n - 1) / n) * n
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestImageBatchAlign(flow.unittest.TestCase):
+    def test_image_batch_align(test_case):
+        image_files = [
+            "/dataset/mscoco_2017/val2017/000000000139.jpg",
+            "/dataset/mscoco_2017/val2017/000000000632.jpg",
+            "/dataset/mscoco_2017/val2017/000000000785.jpg",
+            "/dataset/mscoco_2017/val2017/000000001000.jpg",
+        ]
+        alignment = 16
+        images = _read_images_by_cv(image_files)
+        image_shape = _get_images_static_shape(images)
+        assert len(image_shape) == 4
+        aligned_image_shape = [
+            image_shape[0],
+            _roundup(image_shape[1], alignment),
+            _roundup(image_shape[2], alignment),
+            image_shape[3],
+        ]
+        image_batch_aligner = flow.nn.image.batch_align(
+            shape=aligned_image_shape[1:], dtype=flow.float, alignment=alignment
+        )
+        images_np_arr_static = np.zeros(image_shape, dtype=np.float32)
+        for (idx, np_arr) in enumerate(images):
+            images_np_arr_static[idx, : np_arr.shape[0], : np_arr.shape[1], :] = np_arr
+        input = flow.Tensor(
+            images_np_arr_static, dtype=flow.float, device=flow.device("cpu")
+        )
+        images_buffer = flow.tensor_to_tensor_buffer(input, instance_dims=3)
+        of_aligned_image = image_batch_aligner(images_buffer).numpy()
+        test_case.assertTrue(
+            np.array_equal(aligned_image_shape, of_aligned_image.shape)
+        )
+        empty_image_array = np.zeros(aligned_image_shape, np.float32)
+        for (empty_image, image) in zip(empty_image_array, images):
+            empty_image[0 : image.shape[0], 0 : image.shape[1], :] = image
+        test_case.assertTrue(np.array_equal(of_aligned_image, empty_image_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_image_decode.py b/python/oneflow/test/modules/test_image_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9a355c7a110ad6a1b2cd6660fe63107a7b45712
--- /dev/null
+++ b/python/oneflow/test/modules/test_image_decode.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestImageDecode(flow.unittest.TestCase):
+    def test_image_decode(test_case):
+        images = [
+            "/dataset/mscoco_2017/val2017/000000000139.jpg",
+            "/dataset/mscoco_2017/val2017/000000000632.jpg",
+        ]
+        image_files = [open(im, "rb") for im in images]
+        images_bytes = [imf.read() for imf in image_files]
+        static_shape = (len(images_bytes), max([len(bys) for bys in images_bytes]))
+        for imf in image_files:
+            imf.close()
+        image_decoder = flow.nn.image.decode(color_space="BGR")
+        images_np_arr = [
+            np.frombuffer(bys, dtype=np.byte).reshape(1, -1) for bys in images_bytes
+        ]
+        images_np_arr_static = np.zeros(static_shape, dtype=np.int8)
+        for (idx, np_arr) in enumerate(images_np_arr):
+            images_np_arr_static[idx, : np_arr.shape[1]] = np_arr
+        input = flow.Tensor(
+            images_np_arr_static, dtype=flow.int8, device=flow.device("cpu")
+        )
+        images_buffer = flow.tensor_to_tensor_buffer(input, instance_dims=1)
+        decoded_images_buffer = image_decoder(images_buffer)
+        of_decoded_images = decoded_images_buffer.numpy()
+        cv2_images = [cv2.imread(image) for image in images]
+        cv2_decoded_images = [np.array(image) for image in cv2_images]
+        for (of_decoded_image, cv2_decoded_image) in zip(
+            of_decoded_images, cv2_decoded_images
+        ):
+            test_case.assertTrue(len(of_decoded_image.shape) == 3)
+            test_case.assertTrue(len(cv2_decoded_image.shape) == 3)
+            test_case.assertTrue(np.allclose(of_decoded_image, cv2_decoded_image))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_image_flip.py b/python/oneflow/test/modules/test_image_flip.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4e2e0d82c860f822b178cce0ad8fb8d74aa03b
--- /dev/null
+++ b/python/oneflow/test/modules/test_image_flip.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _of_image_flip(images, image_static_shape, flip_code):
+    image_tensors = flow.Tensor(images, dtype=flow.float, device=flow.device("cpu"))
+    image_tensor_buffer = flow.tensor_to_tensor_buffer(image_tensors, instance_dims=3)
+    flip_images = flow.nn.image.flip(flip_code)(image_tensor_buffer)
+    return flip_images.numpy()
+
+
+def _read_images_by_cv(image_files):
+    images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+    return [np.expand_dims(image, axis=0) for image in images]
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    assert image_static_shape[0] == 1, str(image_static_shape)
+    image_static_shape[0] = len(image_shapes)
+    return image_static_shape
+
+
+def _compare_image_flip_with_cv(test_case, image_files):
+    images = _read_images_by_cv(image_files)
+    assert all([len(image.shape) == 4 for image in images])
+    image_static_shape = _get_images_static_shape(images)
+    image_paddings = np.zeros(tuple(image_static_shape))
+    for (idx, image) in enumerate(images):
+        image_paddings[
+            idx, : image.shape[1], : image.shape[2], : image.shape[3]
+        ] = image
+    flip_images = _of_image_flip(image_paddings, image_static_shape, 1)
+    for (image, flip_image) in zip(image_paddings, flip_images):
+        exp_flip_image = cv2.flip(image.squeeze(), 1)
+        test_case.assertTrue(np.allclose(exp_flip_image, flip_image))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestImageFlip(flow.unittest.TestCase):
+    def test_image_flip(test_case):
+        _compare_image_flip_with_cv(
+            test_case,
+            [
+                "/dataset/mscoco_2017/val2017/000000000139.jpg",
+                "/dataset/mscoco_2017/val2017/000000000632.jpg",
+            ],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_image_normalize.py b/python/oneflow/test/modules/test_image_normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae2030b9afc313d13b4fcb24e11d15849e0c4d47
--- /dev/null
+++ b/python/oneflow/test/modules/test_image_normalize.py
@@ -0,0 +1,88 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import cv2
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _of_image_normalize(images, image_static_shape, std, mean):
+    image_zeros = np.zeros(tuple(image_static_shape))
+    for (idx, image) in enumerate(images):
+        image_zeros[idx, : image.shape[1], : image.shape[2], : image.shape[3]] = image
+    image_tensors = flow.Tensor(
+        image_zeros, dtype=flow.float, device=flow.device("cpu")
+    )
+    image_tensor_buffer = flow.tensor_to_tensor_buffer(image_tensors, instance_dims=3)
+    image_normalizer = flow.nn.image.normalize(std, mean)
+    norm_images = image_normalizer(image_tensor_buffer)
+    return norm_images.numpy()
+
+
+def _read_images_by_cv(image_files):
+    images = [cv2.imread(image_file).astype(np.single) for image_file in image_files]
+    return [np.expand_dims(image, axis=0) for image in images]
+
+
+def _get_images_static_shape(images):
+    image_shapes = [image.shape for image in images]
+    image_static_shape = np.amax(image_shapes, axis=0)
+    assert isinstance(
+        image_static_shape, np.ndarray
+    ), "image_shapes: {}, image_static_shape: {}".format(
+        str(image_shapes), str(image_static_shape)
+    )
+    image_static_shape = image_static_shape.tolist()
+    assert image_static_shape[0] == 1, str(image_static_shape)
+    image_static_shape[0] = len(image_shapes)
+    return image_static_shape
+
+
+def _compare_image_normalize(test_case, image_files, std, mean):
+    images = _read_images_by_cv(image_files)
+    assert all([len(image.shape) == 4 for image in images])
+    image_static_shape = _get_images_static_shape(images)
+    norm_images = _of_image_normalize(images, image_static_shape, std, mean)
+    std_array = np.array(std).reshape(1, 1, 1, -1)
+    mean_array = np.array(mean).reshape(1, 1, 1, -1)
+    for (image, norm_image) in zip(images, norm_images):
+        np_norm_image = np.squeeze((image - mean_array) / std_array, axis=0)
+        norm_image = norm_image[
+            : np_norm_image.shape[0], : np_norm_image.shape[1], : np_norm_image.shape[2]
+        ]
+        test_case.assertTrue(np.allclose(np_norm_image, norm_image))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestImageNormalize(flow.unittest.TestCase):
+    def test_image_normalize(test_case):
+        _compare_image_normalize(
+            test_case,
+            [
+                "/dataset/mscoco_2017/val2017/000000000139.jpg",
+                "/dataset/mscoco_2017/val2017/000000000632.jpg",
+            ],
+            (102.9801, 115.9465, 122.7717),
+            (1.0, 1.0, 1.0),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_image_resize.py b/python/oneflow/test/modules/test_image_resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e513796e5e64820f86f74a7c718516fff693def
--- /dev/null
+++ b/python/oneflow/test/modules/test_image_resize.py
@@ -0,0 +1,277 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import cv2
+import image_test_util
+import numpy as np
+
+import oneflow as flow
+import oneflow.nn as nn
+import oneflow.unittest
+
+
+def _of_image_resize(
+    image_list,
+    dtype=flow.float32,
+    origin_dtype=flow.float32,
+    channels=3,
+    keep_aspect_ratio=False,
+    target_size=None,
+    min_size=None,
+    max_size=None,
+    resize_side="shorter",
+    interpolation_type="bilinear",
+):
+    assert isinstance(image_list, (list, tuple))
+    assert all((isinstance(image, np.ndarray) for image in image_list))
+    assert all((image.ndim == 3 for image in image_list))
+    assert all((image.shape[2] == channels for image in image_list))
+    res_image_list = []
+    res_size_list = []
+    res_scale_list = []
+    image_resize_module = nn.image.Resize(
+        target_size=target_size,
+        min_size=min_size,
+        max_size=max_size,
+        keep_aspect_ratio=keep_aspect_ratio,
+        resize_side=resize_side,
+        dtype=dtype,
+        interpolation_type=interpolation_type,
+        channels=channels,
+    )
+    for image in image_list:
+        tensor_dtype = dtype if keep_aspect_ratio else origin_dtype
+        input = flow.Tensor(
+            np.expand_dims(image, axis=0), dtype=tensor_dtype, device=flow.device("cpu")
+        )
+        image_buffer = flow.tensor_to_tensor_buffer(input, instance_dims=3)
+        (res_image, scale, new_size) = image_resize_module(image_buffer)
+        res_image = res_image.numpy()
+        scale = scale.numpy()
+        if not keep_aspect_ratio:
+            new_size = np.asarray([(target_size, target_size)])
+        else:
+            new_size = new_size.numpy()
+        res_image_list.append(res_image[0])
+        res_size_list.append(new_size[0])
+        res_scale_list.append(scale[0])
+    return (res_image_list, res_scale_list, res_size_list)
+
+
+def _get_resize_size_and_scale(
+    w,
+    h,
+    target_size,
+    min_size=None,
+    max_size=None,
+    keep_aspect_ratio=True,
+    resize_side="shorter",
+):
+    if keep_aspect_ratio:
+        assert isinstance(target_size, int)
+        aspect_ratio = float(min((w, h))) / float(max((w, h)))
+        (
+            min_res_size,
+            max_res_size,
+        ) = image_test_util.compute_keep_aspect_ratio_resized_size(
+            target_size, min_size, max_size, aspect_ratio, resize_side
+        )
+        if w < h:
+            res_w = min_res_size
+            res_h = max_res_size
+        else:
+            res_w = max_res_size
+            res_h = min_res_size
+    else:
+        assert isinstance(target_size, (list, tuple))
+        assert len(target_size) == 2
+        assert all((isinstance(size, int) for size in target_size))
+        (res_w, res_h) = target_size
+    scale_w = res_w / w
+    scale_h = res_h / h
+    return ((res_w, res_h), (scale_w, scale_h))
+
+
+def _cv_image_resize(
+    image_list,
+    target_size,
+    keep_aspect_ratio=True,
+    min_size=None,
+    max_size=None,
+    resize_side="shorter",
+    interpolation=cv2.INTER_LINEAR,
+    dtype=np.float32,
+):
+    res_image_list = []
+    res_size_list = []
+    res_scale_list = []
+    for image in image_list:
+        (h, w) = image.shape[:2]
+        (new_size, scale) = _get_resize_size_and_scale(
+            w, h, target_size, min_size, max_size, keep_aspect_ratio, resize_side
+        )
+        res_image_list.append(
+            cv2.resize(image.squeeze(), new_size, interpolation=interpolation).astype(
+                dtype
+            )
+        )
+        res_size_list.append(new_size)
+        res_scale_list.append(scale)
+    return (res_image_list, res_scale_list, res_size_list)
+
+
+def _test_image_resize_with_cv(
+    test_case,
+    image_files,
+    target_size,
+    min_size=None,
+    max_size=None,
+    keep_aspect_ratio=True,
+    resize_side="shorter",
+    dtype=flow.float32,
+    origin_dtype=None,
+):
+    if origin_dtype is None:
+        origin_dtype = dtype
+    image_list = image_test_util.read_images_by_cv(image_files, origin_dtype)
+    (of_res_images, of_scales, of_new_sizes) = _of_image_resize(
+        image_list=image_list,
+        dtype=dtype,
+        origin_dtype=origin_dtype,
+        keep_aspect_ratio=keep_aspect_ratio,
+        target_size=target_size,
+        min_size=min_size,
+        max_size=max_size,
+        resize_side=resize_side,
+    )
+    (cv_res_images, cv_scales, cv_new_sizes) = _cv_image_resize(
+        image_list=image_list,
+        target_size=target_size,
+        keep_aspect_ratio=keep_aspect_ratio,
+        min_size=min_size,
+        max_size=max_size,
+        resize_side=resize_side,
+        dtype=flow.convert_oneflow_dtype_to_numpy_dtype(dtype),
+    )
+    for (
+        of_res_image,
+        cv_res_image,
+        of_scale,
+        cv_scale,
+        of_new_size,
+        cv_new_size,
+    ) in zip(
+        of_res_images, cv_res_images, of_scales, cv_scales, of_new_sizes, cv_new_sizes
+    ):
+        test_case.assertTrue(np.allclose(of_res_image, cv_res_image))
+        test_case.assertTrue(np.allclose(of_scale, cv_scale))
+        test_case.assertTrue(np.allclose(of_new_size, cv_new_size))
+
+
+@flow.unittest.skip_unless_1n1d()
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+class TestImageResize(flow.unittest.TestCase):
+    def test_image_resize_to_fixed_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case, image_files, target_size=(224, 224), keep_aspect_ratio=False
+        )
+
+    def test_image_resize_shorter_to_target_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=800,
+            keep_aspect_ratio=True,
+            resize_side="shorter",
+        )
+
+    def test_image_resize_longer_to_target_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=1000,
+            keep_aspect_ratio=True,
+            resize_side="longer",
+        )
+
+    def test_image_resize_shorter_to_target_size_with_max_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=800,
+            max_size=1333,
+            keep_aspect_ratio=True,
+            resize_side="shorter",
+        )
+
+    def test_image_resize_longer_to_target_size_with_min_size(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=1000,
+            min_size=600,
+            keep_aspect_ratio=True,
+            resize_side="longer",
+        )
+
+    def test_image_resize_to_fixed_size_with_dtype_uint8(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=(1000, 1000),
+            keep_aspect_ratio=False,
+            dtype=flow.uint8,
+        )
+
+    def test_image_reisze_shorter_to_target_size_with_max_size_with_dtype_uint8(
+        test_case,
+    ):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=1000,
+            max_size=1600,
+            keep_aspect_ratio=True,
+            resize_side="shorter",
+            dtype=flow.uint8,
+        )
+
+    def test_image_resize_uint8_to_float(test_case):
+        (image_files, _) = image_test_util.random_sample_images_from_coco()
+        _test_image_resize_with_cv(
+            test_case,
+            image_files,
+            target_size=(1000, 1000),
+            keep_aspect_ratio=False,
+            dtype=flow.float32,
+            origin_dtype=flow.uint8,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_in_top_k.py b/python/oneflow/test/modules/test_in_top_k.py
new file mode 100644
index 0000000000000000000000000000000000000000..2da4903f084145319a5f542da5d05e9befebac71
--- /dev/null
+++ b/python/oneflow/test/modules/test_in_top_k.py
@@ -0,0 +1,113 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _topk_np(input, k, dim: int = -1, largest: bool = True, _sorted: bool = True):
+    in_dims = input.shape
+    out_dims = list(in_dims)
+    num_axes = len(input.shape)
+    if dim < 0:
+        dim = dim + num_axes
+    n = in_dims[dim]
+    if k > n:
+        k = n
+    out_dims[dim] = k
+    out_dims = tuple(out_dims)
+    prev_dims = 1
+    next_dims = 1
+    for i in range(dim):
+        prev_dims *= in_dims[i]
+    for i in range(dim + 1, len(in_dims)):
+        next_dims *= in_dims[i]
+    input_flat = input.reshape((prev_dims, n, next_dims))
+    values_ref = np.ndarray(shape=(prev_dims, k, next_dims), dtype=input.dtype)
+    values_ref.fill(0)
+    indices_ref = np.ndarray(shape=(prev_dims, k, next_dims), dtype=np.int64)
+    indices_ref.fill(-1)
+    for i in range(prev_dims):
+        for j in range(next_dims):
+            kv = []
+            for x in range(n):
+                val = input_flat[i, x, j]
+                y = x * next_dims + i * in_dims[dim] * next_dims + j
+                kv.append((val, x, y))
+            cnt = 0
+            for (val, x, y) in sorted(kv, key=lambda x: (x[0], -x[1]), reverse=largest):
+                values_ref[i, cnt, j] = val
+                indices_ref[i, cnt, j] = x
+                cnt += 1
+                if cnt >= k or cnt >= n:
+                    break
+    values_ref = values_ref.reshape(out_dims)
+    indices_ref = indices_ref.reshape(out_dims)
+    return (values_ref, indices_ref)
+
+
+def _in_top_k_np(targets, predictions, k):
+    assert (
+        targets.shape[0] == predictions.shape[0]
+    ), "The num of targets must equal the num of predictions"
+    assert len(targets.shape) == 1, "The dimension of targets must be 1"
+    assert len(predictions.shape) == 2, "The dimension of predictions must be 2"
+    results = np.zeros_like(targets, dtype=np.int8)
+    for i in range(len(results)):
+        (_, indices_topk) = _topk_np(predictions[i], k)
+        if targets[i] in indices_topk:
+            results[i] = 1
+    return results
+
+
+def _test_in_top_k_impl(test_case, shape, k, device):
+    np_targets = np.random.randint(0, shape[1], size=shape[0])
+    np_predictions = np.random.rand(*shape)
+    of_targets = flow.Tensor(
+        np_targets, dtype=flow.int32, device=flow.device(device), requires_grad=True
+    )
+    of_predictions = flow.Tensor(
+        np_predictions,
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.in_top_k(of_targets, of_predictions, k)
+    np_out = _in_top_k_np(np_targets, np_predictions, k)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestInTopK(flow.unittest.TestCase):
+    def test_in_top_k(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (3, 4), (5, 6)]
+        arg_dict["k"] = [1, 2, 5]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_in_top_k_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_instancenorm.py b/python/oneflow/test/modules/test_instancenorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..27760fa9a13eabf4bcf3befd0a488130673697c8
--- /dev/null
+++ b/python/oneflow/test/modules/test_instancenorm.py
@@ -0,0 +1,423 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_instancenorm1d(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [-0.1091, 2.0041, 0.885, -0.0412],
+                [-1.2055, 0.7442, 2.33, 1.2411],
+                [-1.2466, 0.3667, 1.2267, 0.3043],
+            ],
+            [
+                [-0.2484, -1.1407, 0.3352, 0.6687],
+                [-0.2975, -0.0227, -0.2302, -0.3762],
+                [-0.7759, -0.6789, 1.1444, 1.8077],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output_arr = np.array(
+        [
+            [
+                [-0.9262, 1.5395, 0.2337, -0.847],
+                [-1.5486, -0.026, 1.2125, 0.3621],
+                [-1.5807, 0.2287, 1.1933, 0.1587],
+            ],
+            [
+                [-0.2215, -1.5212, 0.6285, 1.1143],
+                [-0.5016, 1.5917, 0.011, -1.1011],
+                [-1.0207, -0.9346, 0.6833, 1.2719],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.InstanceNorm1d(num_features=3, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, rtol=0.0001, atol=0.0001))
+    m.eval()
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, rtol=0.0001, atol=0.0001))
+
+
+def _test_instancenorm2d(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output = np.array(
+        [
+            [
+                [
+                    [-0.9155, 0.31, 0.8339, -0.2747],
+                    [0.8991, -1.8781, -1.0048, 0.2183],
+                    [0.3412, 1.9598, 0.3977, -0.8868],
+                ],
+                [
+                    [0.586, -0.3169, 0.5763, -0.9675],
+                    [1.3837, -0.4389, 0.8551, -2.6483],
+                    [0.2867, 0.5761, 0.1435, -0.0358],
+                ],
+            ],
+            [
+                [
+                    [1.374, 1.3515, -0.4466, 0.4017],
+                    [-0.7215, 1.0177, 0.5765, -0.6405],
+                    [-1.6636, -0.663, 0.7669, -1.353],
+                ],
+                [
+                    [-1.1583, 1.1444, -1.4363, 0.7516],
+                    [0.8586, 1.4328, 0.009, -1.0057],
+                    [0.4954, -0.7351, -1.1955, 0.8391],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.InstanceNorm2d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 0.0001, 0.0001))
+    m.eval()
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 0.0001, 0.0001))
+
+
+def _test_instancenorm3d(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [
+                        [1.04569761, 0.22863248, 1.42439335, 1.62249689],
+                        [-0.80578825, -0.27276461, 1.04556507, 0.56864134],
+                        [-1.24085419, -1.23960097, 0.33451416, -1.84820402],
+                    ],
+                    [
+                        [-1.511261, 1.06157517, -0.26715858, -1.32888141],
+                        [1.17976881, -0.07931171, 0.33910684, -1.93458573],
+                        [-1.72659647, 0.79049652, 0.39102785, -1.16264882],
+                    ],
+                ],
+                [
+                    [
+                        [0.30067973, -1.2912226, -0.61508225, 0.56454001],
+                        [0.87074187, -1.69257376, 0.36119148, -0.31014289],
+                        [0.20776964, 1.26195488, -1.37122193, -0.17945234],
+                    ],
+                    [
+                        [-0.31112407, -0.80682631, 0.8233194, 0.6384975],
+                        [0.57617527, 0.45505028, 1.68286151, -1.09590744],
+                        [-1.18127546, -1.07529277, 0.52779943, 1.21755926],
+                    ],
+                ],
+            ],
+            [
+                [
+                    [
+                        [-0.12832351, 1.05625455, -0.23253249, -0.64747611],
+                        [-0.00738123, -1.41390089, -1.92664144, -0.21427625],
+                        [-0.94631219, -0.86493989, 0.21026905, 0.24989732],
+                    ],
+                    [
+                        [1.3859182, 1.72002107, 0.50091892, 1.04198896],
+                        [0.71694594, 1.66417023, -1.63030052, 0.77182641],
+                        [0.71545083, 1.96458366, -1.99031931, 1.3196714],
+                    ],
+                ],
+                [
+                    [
+                        [1.80091702, 0.02834973, 0.82259214, -1.05597501],
+                        [-0.58212207, 0.44205949, -0.14740003, -0.994508],
+                        [1.14678114, -0.39196097, 1.2554798, -0.41829324],
+                    ],
+                    [
+                        [-1.0153903, -0.25755713, -1.81756333, -1.06781159],
+                        [1.79680841, -1.9107133, -0.64325796, -1.94640775],
+                        [1.30671156, 1.20445339, -1.26262901, -0.79494188],
+                    ],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    output_arr = np.array(
+        [
+            [
+                [
+                    [
+                        [1.067, 0.3324, 1.4075, 1.5856],
+                        [-0.5976, -0.1184, 1.0669, 0.6381],
+                        [-0.9888, -0.9877, 0.4276, -1.5349],
+                    ],
+                    [
+                        [-1.2319, 1.0813, -0.1134, -1.068],
+                        [1.1876, 0.0555, 0.4317, -1.6126],
+                        [-1.4256, 0.8376, 0.4784, -0.9185],
+                    ],
+                ],
+                [
+                    [
+                        [0.3447, -1.3751, -0.6446, 0.6298],
+                        [0.9606, -1.8087, 0.4101, -0.3152],
+                        [0.2444, 1.3833, -1.4615, -0.174],
+                    ],
+                    [
+                        [-0.3162, -0.8518, 0.9094, 0.7097],
+                        [0.6424, 0.5115, 1.838, -1.1641],
+                        [-1.2563, -1.1418, 0.5901, 1.3353],
+                    ],
+                ],
+            ],
+            [
+                [
+                    [
+                        [-0.2327, 0.8016, -0.3236, -0.6859],
+                        [-0.1271, -1.3551, -1.8028, -0.3077],
+                        [-0.9469, -0.8758, 0.063, 0.0976],
+                    ],
+                    [
+                        [1.0895, 1.3812, 0.3167, 0.7892],
+                        [0.5054, 1.3324, -1.5441, 0.5533],
+                        [0.5041, 1.5947, -1.8584, 1.0316],
+                    ],
+                ],
+                [
+                    [
+                        [1.7507, 0.1901, 0.8894, -0.7645],
+                        [-0.3473, 0.5544, 0.0354, -0.7104],
+                        [1.1748, -0.1799, 1.2705, -0.2031],
+                    ],
+                    [
+                        [-0.7288, -0.0616, -1.435, -0.7749],
+                        [1.7471, -1.517, -0.4012, -1.5485],
+                        [1.3156, 1.2256, -0.9465, -0.5347],
+                    ],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.InstanceNorm3d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, 0.0001, 0.0001))
+    m.eval()
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output_arr, 0.0001, 0.0001))
+
+
+def _test_instancenorm1d_backward(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [-0.1091, 2.0041, 0.885, -0.0412],
+                [-1.2055, 0.7442, 2.33, 1.2411],
+                [-1.2466, 0.3667, 1.2267, 0.3043],
+            ],
+            [
+                [-0.2484, -1.1407, 0.3352, 0.6687],
+                [-0.2975, -0.0227, -0.2302, -0.3762],
+                [-0.7759, -0.6789, 1.1444, 1.8077],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.InstanceNorm1d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.zeros(shape=input_arr.shape), 1e-05, 1e-05)
+    )
+
+
+def _test_instancenorm2d_backward(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.8791, 0.2553, 0.7403, -0.2859],
+                    [0.8006, -1.7701, -0.9617, 0.1705],
+                    [0.2842, 1.7825, 0.3365, -0.8525],
+                ],
+                [
+                    [0.7332, -0.0737, 0.7245, -0.6551],
+                    [1.4461, -0.1827, 0.9737, -2.1571],
+                    [0.4657, 0.7244, 0.3378, 0.1775],
+                ],
+            ],
+            [
+                [
+                    [1.8896, 1.8686, 0.1896, 0.9817],
+                    [-0.0671, 1.5569, 1.1449, 0.0086],
+                    [-0.9468, -0.0124, 1.3227, -0.6567],
+                ],
+                [
+                    [-0.8472, 1.3012, -1.1065, 0.9348],
+                    [1.0346, 1.5703, 0.2419, -0.7048],
+                    [0.6957, -0.4523, -0.8819, 1.0164],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.InstanceNorm2d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.zeros(shape=input_arr.shape), 1e-05, 1e-05)
+    )
+
+
+def _test_instancenorm3d_backward(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [
+                    [
+                        [1.04569761, 0.22863248, 1.42439335, 1.62249689],
+                        [-0.80578825, -0.27276461, 1.04556507, 0.56864134],
+                        [-1.24085419, -1.23960097, 0.33451416, -1.84820402],
+                    ],
+                    [
+                        [-1.511261, 1.06157517, -0.26715858, -1.32888141],
+                        [1.17976881, -0.07931171, 0.33910684, -1.93458573],
+                        [-1.72659647, 0.79049652, 0.39102785, -1.16264882],
+                    ],
+                ],
+                [
+                    [
+                        [0.30067973, -1.2912226, -0.61508225, 0.56454001],
+                        [0.87074187, -1.69257376, 0.36119148, -0.31014289],
+                        [0.20776964, 1.26195488, -1.37122193, -0.17945234],
+                    ],
+                    [
+                        [-0.31112407, -0.80682631, 0.8233194, 0.6384975],
+                        [0.57617527, 0.45505028, 1.68286151, -1.09590744],
+                        [-1.18127546, -1.07529277, 0.52779943, 1.21755926],
+                    ],
+                ],
+            ],
+            [
+                [
+                    [
+                        [-0.12832351, 1.05625455, -0.23253249, -0.64747611],
+                        [-0.00738123, -1.41390089, -1.92664144, -0.21427625],
+                        [-0.94631219, -0.86493989, 0.21026905, 0.24989732],
+                    ],
+                    [
+                        [1.3859182, 1.72002107, 0.50091892, 1.04198896],
+                        [0.71694594, 1.66417023, -1.63030052, 0.77182641],
+                        [0.71545083, 1.96458366, -1.99031931, 1.3196714],
+                    ],
+                ],
+                [
+                    [
+                        [1.80091702, 0.02834973, 0.82259214, -1.05597501],
+                        [-0.58212207, 0.44205949, -0.14740003, -0.994508],
+                        [1.14678114, -0.39196097, 1.2554798, -0.41829324],
+                    ],
+                    [
+                        [-1.0153903, -0.25755713, -1.81756333, -1.06781159],
+                        [1.79680841, -1.9107133, -0.64325796, -1.94640775],
+                        [1.30671156, 1.20445339, -1.26262901, -0.79494188],
+                    ],
+                ],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    m = flow.nn.InstanceNorm3d(num_features=2, eps=1e-05, momentum=0.1).to(
+        device=flow.device(device)
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.zeros(shape=input_arr.shape), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestInstanceNorm(flow.unittest.TestCase):
+    def test_instancenorm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_instancenorm1d,
+            _test_instancenorm2d,
+            _test_instancenorm3d,
+            _test_instancenorm1d_backward,
+            _test_instancenorm2d_backward,
+            _test_instancenorm3d_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_instruction_replay.py b/python/oneflow/test/modules/test_instruction_replay.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca2eb087b56cc5edf24d209cd1cc861692ab9cb3
--- /dev/null
+++ b/python/oneflow/test/modules/test_instruction_replay.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_instruction_replay_impl(test_case, device, shape):
+    x = flow.Tensor(np.random.rand(*shape), device=flow.device(device))
+    y = flow.Tensor(np.random.rand(*shape), device=flow.device(device))
+    x.determine()
+    y.determine()
+    oneflow._oneflow_internal.debug.start_recording_instructions()
+    z = x + y
+    oneflow._oneflow_internal.debug.end_recording_instructions()
+    test_case.assertTrue(np.allclose(z.numpy(), x.numpy() + y.numpy(), 0.0001, 0.0001))
+    z.zeros_()
+    oneflow._oneflow_internal.debug.replay_instructions()
+    test_case.assertTrue(np.allclose(z.numpy(), x.numpy() + y.numpy(), 0.0001, 0.0001))
+    oneflow._oneflow_internal.debug.clear_recorded_instructions()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestIntructionReplay(flow.unittest.TestCase):
+    def test_instruction_replay(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [[2, 3], [1, 10]]
+        for arg in GenArgList(arg_dict):
+            _test_instruction_replay_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_interpolate.py b/python/oneflow/test/modules/test_interpolate.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2abbd0c4c87721e21ef8e5d0d587b77035b9eac
--- /dev/null
+++ b/python/oneflow/test/modules/test_interpolate.py
@@ -0,0 +1,681 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_interpolate_linear_1d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 4)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="linear")
+    np_out = [[[1.0, 1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.0]]]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[2.0, 2.0, 2.0, 2.0]]]
+    test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 0.0001, 0.0001))
+    input.grad = None
+    of_out = flow.nn.functional.interpolate(
+        input, scale_factor=2.0, mode="linear", align_corners=True
+    )
+    np_out = [
+        [
+            [
+                1.0,
+                1.4285714626312256,
+                1.8571429252624512,
+                2.2857141494750977,
+                2.7142856121063232,
+                3.142857074737549,
+                3.5714285373687744,
+                4.0,
+            ]
+        ]
+    ]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            [
+                1.7142856121063232,
+                2.2857141494750977,
+                2.2857143878936768,
+                1.7142856121063232,
+            ]
+        ]
+    ]
+    test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 0.0001, 0.0001))
+
+
+def _test_interpolate_nearest_1d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 4)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="nearest")
+    np_out = [[[1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0]]]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[2.0, 2.0, 2.0, 2.0]]]
+    test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 0.0001, 0.0001))
+
+
+def _test_interpolate_nearest_2d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="nearest")
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.0, 2.0, 2.0],
+                    [1.0, 1.0, 2.0, 2.0],
+                    [3.0, 3.0, 4.0, 4.0],
+                    [3.0, 3.0, 4.0, 4.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[4.0, 4.0], [4.0, 4.0]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_nearest_3d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 9).reshape((1, 1, 2, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="nearest")
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        [1.0, 1.0, 2.0, 2.0],
+                        [1.0, 1.0, 2.0, 2.0],
+                        [3.0, 3.0, 4.0, 4.0],
+                        [3.0, 3.0, 4.0, 4.0],
+                    ],
+                    [
+                        [1.0, 1.0, 2.0, 2.0],
+                        [1.0, 1.0, 2.0, 2.0],
+                        [3.0, 3.0, 4.0, 4.0],
+                        [3.0, 3.0, 4.0, 4.0],
+                    ],
+                    [
+                        [5.0, 5.0, 6.0, 6.0],
+                        [5.0, 5.0, 6.0, 6.0],
+                        [7.0, 7.0, 8.0, 8.0],
+                        [7.0, 7.0, 8.0, 8.0],
+                    ],
+                    [
+                        [5.0, 5.0, 6.0, 6.0],
+                        [5.0, 5.0, 6.0, 6.0],
+                        [7.0, 7.0, 8.0, 8.0],
+                        [7.0, 7.0, 8.0, 8.0],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[[8.0, 8.0], [8.0, 8.0]], [[8.0, 8.0], [8.0, 8.0]]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_bilinear_2d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="bilinear")
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.25, 1.75, 2.0],
+                    [1.5, 1.75, 2.25, 2.5],
+                    [2.5, 2.75, 3.25, 3.5],
+                    [3.0, 3.25, 3.75, 4.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[4.0, 4.0], [4.0, 4.0]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_bicubic_2d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)).astype(np.float32),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="bicubic")
+    np_out = np.array(
+        [
+            [
+                [
+                    [0.68359375, 1.015625, 1.5625, 1.89453125],
+                    [1.34765625, 1.6796875, 2.2265625, 2.55859375],
+                    [2.44140625, 2.7734375, 3.3203125, 3.65234375],
+                    [3.10546875, 3.4375, 3.984375, 4.31640625],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[4.0, 4.0], [4.0, 4.0]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_bicubic_same_dim_2d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)).astype(np.float32),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=1.0, mode="bicubic")
+    np_out = [[[[1.0, 2.0], [3.0, 4.0]]]]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[1.0, 1.0], [1.0, 1.0]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_trilinear_3d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 9).reshape((1, 1, 2, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="trilinear")
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        [1.0, 1.25, 1.75, 2.0],
+                        [1.5, 1.75, 2.25, 2.5],
+                        [2.5, 2.75, 3.25, 3.5],
+                        [3.0, 3.25, 3.75, 4.0],
+                    ],
+                    [
+                        [2.0, 2.25, 2.75, 3.0],
+                        [2.5, 2.75, 3.25, 3.5],
+                        [3.5, 3.75, 4.25, 4.5],
+                        [4.0, 4.25, 4.75, 5.0],
+                    ],
+                    [
+                        [4.0, 4.25, 4.75, 5.0],
+                        [4.5, 4.75, 5.25, 5.5],
+                        [5.5, 5.75, 6.25, 6.5],
+                        [6.0, 6.25, 6.75, 7.0],
+                    ],
+                    [
+                        [5.0, 5.25, 5.75, 6.0],
+                        [5.5, 5.75, 6.25, 6.5],
+                        [6.5, 6.75, 7.25, 7.5],
+                        [7.0, 7.25, 7.75, 8.0],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[[8.0, 8.0], [8.0, 8.0]], [[8.0, 8.0], [8.0, 8.0]]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_trilinear_3d_align_corners(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 9).reshape((1, 1, 2, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    of_out = flow.nn.functional.interpolate(
+        input, scale_factor=2.0, mode="trilinear", align_corners=True
+    )
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        [1.0, 1.3333332538604736, 1.6666667461395264, 2.0],
+                        [
+                            1.6666666269302368,
+                            2.0,
+                            2.3333334922790527,
+                            2.6666665077209473,
+                        ],
+                        [
+                            2.3333332538604736,
+                            2.6666665077209473,
+                            3.0,
+                            3.3333334922790527,
+                        ],
+                        [3.0, 3.3333332538604736, 3.6666667461395264, 4.0],
+                    ],
+                    [
+                        [
+                            2.3333334922790527,
+                            2.6666665077209473,
+                            3.0,
+                            3.3333332538604736,
+                        ],
+                        [3.0, 3.3333330154418945, 3.6666665077209473, 4.0],
+                        [
+                            3.6666665077209473,
+                            4.0,
+                            4.333333492279053,
+                            4.6666669845581055,
+                        ],
+                        [4.333333492279053, 4.666666030883789, 5.0, 5.3333330154418945],
+                    ],
+                    [
+                        [3.6666667461395264, 4.0, 4.333333492279053, 4.666666507720947],
+                        [4.333333492279053, 4.666666507720947, 5.0, 5.3333330154418945],
+                        [5.0, 5.333333492279053, 5.6666669845581055, 6.0],
+                        [
+                            5.6666669845581055,
+                            6.0,
+                            6.333333492279053,
+                            6.6666669845581055,
+                        ],
+                    ],
+                    [
+                        [5.0, 5.3333330154418945, 5.666666507720947, 6.0],
+                        [
+                            5.666666507720947,
+                            5.999999523162842,
+                            6.3333330154418945,
+                            6.666666507720947,
+                        ],
+                        [6.333333492279053, 6.666666030883789, 7.0, 7.333333492279053],
+                        [7.0, 7.3333330154418945, 7.6666669845581055, 8.0],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            [
+                [[7.999999523162842, 8.0], [7.999999523162842, 8.0]],
+                [[8.0, 8.0], [8.0, 8.0]],
+            ]
+        ]
+    ]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_area_1d(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        0.05580734834074974,
+                        -0.6875145435333252,
+                        -1.654430866241455,
+                        -0.6225992441177368,
+                        0.10183599591255188,
+                        0.05019790679216385,
+                        -1.2537643909454346,
+                        0.14907236397266388,
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out_1 = flow.nn.functional.interpolate(input, size=4, mode="area")
+    of_out_2 = flow.nn.functional.interpolate(input, scale_factor=0.5, mode="area")
+    np_out = np.array(
+        [
+            [
+                [
+                    -0.3158535957336426,
+                    -1.1385149955749512,
+                    0.07601694762706757,
+                    -0.5523459911346436,
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05))
+    of_out_1 = of_out_1.sum()
+    of_out_1.backward()
+    np_grad = np.array([[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]]])
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_area_2d(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            0.10039155930280685,
+                            0.04879157617688179,
+                            -1.0515470504760742,
+                            0.9466001987457275,
+                        ],
+                        [
+                            0.45375481247901917,
+                            0.23611211776733398,
+                            1.343685269355774,
+                            0.3979687988758087,
+                        ],
+                        [
+                            0.05580734834074974,
+                            -0.6875145435333252,
+                            -1.654430866241455,
+                            -0.6225992441177368,
+                        ],
+                        [
+                            0.10183599591255188,
+                            0.05019790679216385,
+                            -1.2537643909454346,
+                            0.14907236397266388,
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out_1 = flow.nn.functional.interpolate(input, size=(2, 2), mode="area")
+    of_out_2 = flow.nn.functional.interpolate(input, scale_factor=0.5, mode="area")
+    np_out = np.array(
+        [
+            [
+                [
+                    [0.20976251363754272, 0.4091767966747284],
+                    [-0.1199183315038681, -0.8454304933547974],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05))
+    of_out_1 = of_out_1.sum()
+    of_out_1.backward()
+    np_grad = np.array(
+        [
+            [
+                [
+                    [0.25, 0.25, 0.25, 0.25],
+                    [0.25, 0.25, 0.25, 0.25],
+                    [0.25, 0.25, 0.25, 0.25],
+                    [0.25, 0.25, 0.25, 0.25],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_area_3d(test_case, device):
+    input = flow.Tensor(
+        np.array(
+            [
+                [
+                    [
+                        [
+                            [
+                                -1.077571799600885,
+                                -0.7804538890365837,
+                                -1.2627538752119443,
+                                0.9993507145120477,
+                            ],
+                            [
+                                2.0222532489157516,
+                                1.103451377699465,
+                                -0.4377324754879578,
+                                1.890491810587517,
+                            ],
+                            [
+                                -0.5593861899064654,
+                                -0.4949520241526519,
+                                -0.18536721363519787,
+                                -0.6098969866775772,
+                            ],
+                            [
+                                -1.6536215260171816,
+                                -1.0392583540436786,
+                                0.3686776597613967,
+                                -0.5356882834951805,
+                            ],
+                        ],
+                        [
+                            [
+                                -1.2617900664449953,
+                                -1.4390921091631532,
+                                0.20654399652431357,
+                                0.8186472101906713,
+                            ],
+                            [
+                                -0.3033378863400014,
+                                -0.8173269764076293,
+                                -0.3767515097625614,
+                                -0.11021655039337777,
+                            ],
+                            [
+                                -0.22977043608192885,
+                                1.2717196366649905,
+                                -0.4790851297878291,
+                                -1.4495369404727856,
+                            ],
+                            [
+                                -1.2802093286977783,
+                                -0.11184514806663474,
+                                1.7022167087210984,
+                                -1.7354837287725355,
+                            ],
+                        ],
+                        [
+                            [
+                                2.4706497991773606,
+                                -0.6549702631973298,
+                                -0.9318107079571676,
+                                1.4652904271682428,
+                            ],
+                            [
+                                1.1419864234341397,
+                                1.389909081086008,
+                                0.9657841900525568,
+                                -0.8563114264976619,
+                            ],
+                            [
+                                0.19515087084250754,
+                                -0.37808457398571094,
+                                0.2938625398496183,
+                                0.9279930510353327,
+                            ],
+                            [
+                                -0.9374118277994007,
+                                0.3341831730452431,
+                                -0.2792542765303833,
+                                0.38029090707066726,
+                            ],
+                        ],
+                        [
+                            [
+                                0.5918686659736041,
+                                -0.7870631089938902,
+                                -0.9534344874245392,
+                                0.31341612954718795,
+                            ],
+                            [
+                                0.7509029444145228,
+                                -0.9299288398562323,
+                                -0.7343054052782476,
+                                -0.8806481590696694,
+                            ],
+                            [
+                                -0.4707853016353985,
+                                0.12253641652645629,
+                                0.5088022039832846,
+                                0.520391789327562,
+                            ],
+                            [
+                                -0.0861300651163632,
+                                0.30291348404866386,
+                                -0.6268565873680123,
+                                -0.27469204305759976,
+                            ],
+                        ],
+                    ]
+                ]
+            ]
+        ),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out_1 = flow.nn.functional.interpolate(input, size=(2, 2, 2), mode="area")
+    of_out_2 = flow.nn.functional.interpolate(input, scale_factor=0.5, mode="area")
+    np_out = np.array(
+        [
+            [
+                [
+                    [
+                        [-0.3192335125472539, 0.2159474151198386],
+                        [-0.5121654212876662, -0.3655204892948264],
+                    ],
+                    [
+                        [0.4966693377547728, -0.2015024299324123],
+                        [-0.11470347800925032, 0.18131719803880864],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05))
+    of_out_1 = of_out_1.sum()
+    of_out_1.backward()
+    np_grad = np.array(
+        [
+            [
+                [
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                    [
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                        [0.125, 0.125, 0.125, 0.125],
+                    ],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestInterpolate(flow.unittest.TestCase):
+    def test_interpolate(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_interpolate_linear_1d,
+            _test_interpolate_nearest_1d,
+            _test_interpolate_nearest_2d,
+            _test_interpolate_nearest_3d,
+            _test_interpolate_bilinear_2d,
+            _test_interpolate_bicubic_2d,
+            _test_interpolate_bicubic_same_dim_2d,
+            _test_interpolate_trilinear_3d,
+            _test_interpolate_trilinear_3d_align_corners,
+            _test_interpolate_area_1d,
+            _test_interpolate_area_2d,
+            _test_interpolate_area_3d,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            for i in range(100):
+                arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_kldivloss.py b/python/oneflow/test/modules/test_kldivloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1292dcc1c50865679c951e4970960c1422b2bbe0
--- /dev/null
+++ b/python/oneflow/test/modules/test_kldivloss.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_kldivloss(np_input, np_target, np_log_target):
+    if np_log_target:
+        np_kl_div_loss = np.exp(np_target) * (np_target - np_input)
+    else:
+        np_kl_div_out_loss = np_target * (np.log(np_target) - np_input)
+        np_zeros = np.zeros_like(np_kl_div_out_loss, dtype=np.float32)
+        np_kl_div_loss = np.where(np_target > 0, np_kl_div_out_loss, np_zeros)
+    return {
+        "none": np_kl_div_loss,
+        "mean": np.mean(np_kl_div_loss),
+        "sum": np.sum(np_kl_div_loss),
+    }
+
+
+def _np_kldivloss_grad(input, target, np_log_target):
+    elem_cnt = input.size
+    if np_log_target:
+        _np_diff = -np.exp(target)
+    else:
+        _np_diff = -target
+        _zero_index = np.where(target > 0, 1, 0)
+        _np_diff = _np_diff * _zero_index
+    return {"none": _np_diff, "mean": _np_diff / elem_cnt, "sum": _np_diff}
+
+
+def _test_kldivloss_forward(test_case, device, shape, reduction, log_target):
+    x = np.random.randn(*shape)
+    y = np.random.randn(*shape)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    loss = flow.nn.KLDivLoss(reduction=reduction, log_target=log_target)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    np_out = _np_kldivloss(x, y, log_target)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_kldivloss_backward(test_case, device, shape, reduction, log_target):
+    x = np.random.randn(*shape)
+    y = np.random.randn(*shape)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    loss = flow.nn.KLDivLoss(reduction=reduction, log_target=log_target)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_kldivloss_grad(x, y, log_target)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestKLDivLossModule(flow.unittest.TestCase):
+    def test_kldivloss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_kldivloss_forward, _test_kldivloss_backward]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [
+            (3, 5),
+            (10, 9, 21),
+            (14, 22, 9, 21),
+            (3, 2, 4, 16, 5),
+            (1,),
+        ]
+        arg_dict["reduction"] = ["none", "mean", "sum"]
+        arg_dict["log_target"] = [False, True]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_l1loss.py b/python/oneflow/test/modules/test_l1loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..88125dcf69ddd8a2999833ab76056543080aba79
--- /dev/null
+++ b/python/oneflow/test/modules/test_l1loss.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_l1loss(np_input, np_target):
+    np_l1 = np.abs(np_target - np_input)
+    np_l1_sum = np.sum(np_l1)
+    np_l1_mean = np.mean(np_l1)
+    return {"none": np_l1, "mean": np_l1_mean, "sum": np_l1_sum}
+
+
+def _np_l1loss_grad(np_input, np_target):
+    elem_cnt = np_input.size
+    np_grad = np.zeros_like(np_target)
+    np_grad = np.sign(np_input - np_target)
+    np_l1_grad_sum = np_grad
+    np_l1_grad_mean = np_l1_grad_sum / elem_cnt
+    return {"none": np_grad, "mean": np_l1_grad_mean, "sum": np_l1_grad_sum}
+
+
+def _test_l1loss_impl(test_case, device, shape, reduction):
+    x = np.random.randn(*shape).astype(np.float32)
+    y = np.random.randn(*shape).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    loss = flow.nn.L1Loss(reduction)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    np_out = _np_l1loss(x, y)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_l1loss_grad(x, y)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestL1LossModule(flow.unittest.TestCase):
+    def test_l1loss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_l1loss_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [
+            (3, 5),
+            (10, 9, 21),
+            (14, 22, 9, 21),
+            (3, 2, 4, 16, 5),
+            (1,),
+        ]
+        arg_dict["reduction"] = ["none", "sum", "mean"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_less.py b/python/oneflow/test/modules/test_less.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3d8a52b344fbab819308109caf3a557b7638f59
--- /dev/null
+++ b/python/oneflow/test/modules/test_less.py
@@ -0,0 +1,119 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_less_normal(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.lt(input1, input2)
+    np_out = np.less(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_symbol(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 1, 4]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    input2 = flow.Tensor(
+        np.array([1, 2, 3]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = input1 < input2
+    np_out = np.less(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_int_scalar(test_case, device):
+    np_arr = np.random.randn(2, 3, 4, 5)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 1
+    of_out = input1 < input2
+    np_out = np.less(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_int_tensor_int_scalr(test_case, device):
+    np_arr = np.random.randint(2, size=(2, 3, 4, 5))
+    input1 = flow.Tensor(np_arr, dtype=flow.int, device=flow.device(device))
+    input2 = 1
+    of_out = input1 < input2
+    np_out = np.less(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_float_scalar(test_case, device):
+    np_arr = np.random.randn(3, 2, 5, 7)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 2.3
+    of_out = input1 < input2
+    np_out = np.less(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLess(flow.unittest.TestCase):
+    def test_less(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_less_normal,
+            _test_less_symbol,
+            _test_less_int_scalar,
+            _test_less_int_tensor_int_scalr,
+            _test_less_float_scalar,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest(n=60, auto_backward=False)
+    def test_less_with_random_data(test_case):
+        device = random_device()
+        shape = random_tensor().value().shape
+        x1 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        x2 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        y = torch.lt(x1, oneof(x2, random().to(int).to(float)))
+        return y
+
+    @autotest(n=60, auto_backward=False)
+    def test_tensor_less_with_random_data(test_case):
+        device = random_device()
+        shape = random_tensor().value().shape
+        x1 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        x2 = random_pytorch_tensor(len(shape), *shape, requires_grad=False).to(device)
+        y1 = x1.lt(oneof(x2, random().to(int), random().to(float)))
+        y2 = x1 < x2
+        return (y1, y2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_less_equal.py b/python/oneflow/test/modules/test_less_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..57b6bcd6ffff48fc916a60e72c76798755a24e22
--- /dev/null
+++ b/python/oneflow/test/modules/test_less_equal.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_less_equal_normal(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.le(input1, input2)
+    np_out = np.less_equal(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_equal_symbol(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 1, 4]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    input2 = flow.Tensor(
+        np.array([1, 2, 3]).astype(np.float32),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    of_out = input1 <= input2
+    np_out = np.less_equal(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_equal_int_scalar(test_case, device):
+    np_arr = np.random.randn(2, 3, 4, 5)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 1
+    of_out = input1 <= input2
+    np_out = np.less_equal(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_equal_int_tensor_int_scalr(test_case, device):
+    np_arr = np.random.randint(2, size=(2, 3, 4, 5))
+    input1 = flow.Tensor(np_arr, dtype=flow.int, device=flow.device(device))
+    input2 = 1
+    of_out = input1 <= input2
+    np_out = np.less_equal(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_less_equal_float_scalar(test_case, device):
+    np_arr = np.random.randn(3, 2, 5, 7)
+    input1 = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    input2 = 2.3
+    of_out = input1 <= input2
+    np_out = np.less_equal(np_arr, input2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLessEqual(flow.unittest.TestCase):
+    def test_less_equal(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_less_equal_normal,
+            _test_less_equal_symbol,
+            _test_less_equal_int_scalar,
+            _test_less_equal_int_tensor_int_scalr,
+            _test_less_equal_float_scalar,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_linear.py b/python/oneflow/test/modules/test_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..10487919e29ddbf132225b9364af852a71600f8b
--- /dev/null
+++ b/python/oneflow/test/modules/test_linear.py
@@ -0,0 +1,208 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_linear_no_bias(test_case, device):
+    linear = flow.nn.Linear(3, 8, False)
+    linear = linear.to(device)
+    input_arr = np.array(
+        [
+            [-0.94630778, -0.83378579, -0.87060891],
+            [2.0289922, -0.28708987, -2.18369248],
+            [0.35217619, -0.67095644, -1.58943879],
+            [0.08086036, -1.81075924, 1.20752494],
+            [0.8901075, -0.49976737, -1.07153746],
+            [-0.44872912, -1.07275683, 0.06256855],
+            [-0.22556897, 0.74798368, 0.90416439],
+            [0.48339456, -2.32742195, -0.59321527],
+        ],
+        dtype=np.float32,
+    )
+    np_weight = np.ones((3, 8)).astype(np.float32)
+    np_weight.fill(2.3)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    flow.nn.init.constant_(linear.weight, 2.3)
+    of_out = linear(x)
+    np_out = np.matmul(input_arr, np_weight)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_linear_with_bias(test_case, device):
+    linear = flow.nn.Linear(3, 8)
+    linear = linear.to(device)
+    input_arr = np.array(
+        [
+            [-0.94630778, -0.83378579, -0.87060891],
+            [2.0289922, -0.28708987, -2.18369248],
+            [0.35217619, -0.67095644, -1.58943879],
+            [0.08086036, -1.81075924, 1.20752494],
+            [0.8901075, -0.49976737, -1.07153746],
+            [-0.44872912, -1.07275683, 0.06256855],
+            [-0.22556897, 0.74798368, 0.90416439],
+            [0.48339456, -2.32742195, -0.59321527],
+        ],
+        dtype=np.float32,
+    )
+    np_weight = np.ones((3, 8)).astype(np.float32)
+    np_weight.fill(2.068758)
+    np_bias = np.ones(8)
+    np_bias.fill(0.23)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    flow.nn.init.constant_(linear.weight, 2.068758)
+    flow.nn.init.constant_(linear.bias, 0.23)
+    of_out = linear(x)
+    np_out = np.matmul(input_arr, np_weight)
+    np_out += np_bias
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_linear_3_dimension_input(test_case, device):
+    input_arr = np.random.randn(2, 3, 4)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    linear = flow.nn.Linear(4, 5, True)
+    linear = linear.to(device)
+    flow.nn.init.constant_(linear.weight, 5.6)
+    flow.nn.init.constant_(linear.bias, 0.78)
+    of_out = linear(x)
+    np_weight = np.ones((4, 5)).astype(np.float32)
+    np_weight.fill(5.6)
+    np_bias = np.ones(5)
+    np_bias.fill(0.78)
+    np_out = np.matmul(input_arr, np_weight)
+    np_out += np_bias
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_linear_4_dimension_input(test_case, device):
+    input_arr = np.random.randn(4, 5, 6, 7)
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    linear = flow.nn.Linear(7, 3, False)
+    linear = linear.to(device)
+    flow.nn.init.constant_(linear.weight, 11.3)
+    of_out = linear(x)
+    np_weight = np.ones((7, 3)).astype(np.float32)
+    np_weight.fill(11.3)
+    np_out = np.matmul(input_arr, np_weight)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_identity(test_case, device):
+    linear = flow.nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
+    linear = linear.to(device)
+    x = flow.Tensor(np.random.rand(2, 3, 4, 5), device=flow.device(device))
+    y = linear(x)
+    test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+
+def _test_linear_backward_with_bias(test_case, device):
+    linear = flow.nn.Linear(3, 8)
+    linear = linear.to(device)
+    x = flow.Tensor(
+        [
+            [-0.94630778, -0.83378579, -0.87060891],
+            [2.0289922, -0.28708987, -2.18369248],
+            [0.35217619, -0.67095644, -1.58943879],
+            [0.08086036, -1.81075924, 1.20752494],
+            [0.8901075, -0.49976737, -1.07153746],
+            [-0.44872912, -1.07275683, 0.06256855],
+            [-0.22556897, 0.74798368, 0.90416439],
+            [0.48339456, -2.32742195, -0.59321527],
+        ],
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    flow.nn.init.constant_(linear.weight, 2.068758)
+    flow.nn.init.constant_(linear.bias, 0.23)
+    of_out = linear(x)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array(
+        [
+            [16.5501, 16.5501, 16.5501],
+            [16.5501, 16.5501, 16.5501],
+            [16.5501, 16.5501, 16.5501],
+            [16.5501, 16.5501, 16.5501],
+            [16.5501, 16.5501, 16.5501],
+            [16.5501, 16.5501, 16.5501],
+            [16.5501, 16.5501, 16.5501],
+            [16.5501, 16.5501, 16.5501],
+        ]
+    )
+    test_case.assertTrue(np.allclose(np_grad, x.grad.numpy(), 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLinear(flow.unittest.TestCase):
+    def test_linear_forward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_linear_no_bias,
+            _test_linear_with_bias,
+            _test_linear_3_dimension_input,
+            _test_linear_4_dimension_input,
+            _test_identity,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_linear_backward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_linear_backward_with_bias]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest()
+    def test_linear_with_random_data(test_case):
+        input_size = random()
+        m = torch.nn.Linear(
+            in_features=input_size, out_features=random(), bias=random() | nothing()
+        )
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor(ndim=2, dim1=input_size).to(device)
+        y = m(x)
+        return y
+
+    @autotest()
+    def test_identity_with_random_data(test_case):
+        m = torch.nn.Identity(
+            x=random().to(int),
+            unused_argument1=random().to(float),
+            unused_argument2=random().to(float),
+        )
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_log1p.py b/python/oneflow/test/modules/test_log1p.py
new file mode 100644
index 0000000000000000000000000000000000000000..9910882fc62398608a0db214cecb7e72ce55989b
--- /dev/null
+++ b/python/oneflow/test/modules/test_log1p.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_log1p(test_case, shape, device):
+    input_arr = np.exp(np.random.randn(*shape)) - 1
+    np_out = np.log1p(input_arr)
+    x = flow.Tensor(
+        input_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.log1p(x)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1.0 / (1 + input_arr)
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+def _test_log1p_tensor_function(test_case, shape, device):
+    input_arr = np.exp(np.random.randn(*shape)) - 1
+    np_out = np.log1p(input_arr)
+    x = flow.Tensor(
+        input_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = x.log1p()
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1.0 / (1 + input_arr)
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLog1p(flow.unittest.TestCase):
+    def test_log1p(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_log1p, _test_log1p_tensor_function]
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_lr_scheduler.py b/python/oneflow/test/modules/test_lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f4f30f66a356e64ce10b1cae486548e8e17a9d
--- /dev/null
+++ b/python/oneflow/test/modules/test_lr_scheduler.py
@@ -0,0 +1,97 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import unittest
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.nn.parameter import Parameter
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLrScheduler(flow.unittest.TestCase):
+    base_lr = 1.0
+
+    def test_cosine_annealing_lr(test_case):
+        optimizer = flow.optim.SGD(
+            [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
+        )
+
+        def cosine_annealing_lr_step(base_lr, current_step, steps, alpha):
+            if current_step < steps:
+                cos_decay = 0.5 * (1 + math.cos(math.pi * current_step / steps))
+                decay_factor = (1 - alpha) * cos_decay + alpha
+                return base_lr * decay_factor
+            else:
+                return base_lr * alpha
+
+        alpha = 0.5
+        steps = 10
+        cosine_annealing_lr = flow.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer, steps=steps, alpha=alpha
+        )
+        for i in range(1, 21):
+            cosine_annealing_lr.step()
+            new_lr = cosine_annealing_lr_step(TestLrScheduler.base_lr, i, steps, alpha)
+            test_case.assertAlmostEqual(
+                cosine_annealing_lr.get_last_lr()[0], new_lr, places=4
+            )
+
+    def test_step_lr(test_case):
+        optimizer = flow.optim.SGD(
+            [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
+        )
+
+        def step_lr_step(base_lr, current_step, step_size, gamma):
+            return base_lr * gamma ** (current_step // step_size)
+
+        gamma = 0.1
+        step_size = 5
+        step_lr = flow.optim.lr_scheduler.StepLR(
+            optimizer, step_size=step_size, gamma=gamma
+        )
+        for i in range(1, 21):
+            step_lr.step()
+            new_lr = step_lr_step(TestLrScheduler.base_lr, i, step_size, gamma)
+            test_case.assertAlmostEqual(step_lr.get_last_lr()[0], new_lr, places=5)
+
+    def test_lambda_lr(test_case):
+        optimizer = flow.optim.SGD(
+            [
+                {"params": [Parameter(flow.Tensor([1.0]))]},
+                {"params": [Parameter(flow.Tensor([1.0]))]},
+            ],
+            lr=TestLrScheduler.base_lr,
+        )
+        lambdas = [lambda step: step // 30, lambda step: 0.95 * step]
+
+        def lambda_lr_step(base_lrs, current_step):
+            return [
+                base_lr * lmbda(current_step)
+                for (base_lr, lmbda) in zip(base_lrs, lambdas)
+            ]
+
+        lambda_lr = flow.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambdas)
+        for i in range(1, 21):
+            lambda_lr.step()
+            new_lrs = lambda_lr_step(lambda_lr.base_lrs, i)
+            for (lr1, lr2) in zip(lambda_lr.get_last_lr(), new_lrs):
+                test_case.assertAlmostEqual(lr1, lr2, places=5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_marginrankingloss.py b/python/oneflow/test/modules/test_marginrankingloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..26aa39d44157edf0bc6233ed06655abfbb7f0c06
--- /dev/null
+++ b/python/oneflow/test/modules/test_marginrankingloss.py
@@ -0,0 +1,170 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def np_margin_ranking_loss(margin, input1, input2, targets, reduction="none"):
+    out = np.clip(margin + -targets * (input1 - input2), a_min=0, a_max=None)
+    if reduction == "sum":
+        return np.sum(out)
+    elif reduction == "mean":
+        return out.mean()
+    elif reduction == "none":
+        return out
+
+
+def np_margin_ranking_loss_grad(margin, input1, input2, targets):
+    out = np.clip(margin + -targets * (input1 - input2), a_min=0, a_max=None)
+    out_grad1 = -1 * np.zeros_like(targets)
+    out_grad2 = np.zeros_like(targets)
+    out_grad1[np.nonzero(out)] = -targets[np.nonzero(out)]
+    out_grad2[np.nonzero(out)] = targets[np.nonzero(out)]
+    return (out_grad1, out_grad2)
+
+
+def _test_marginrankingloss_none(test_case, shape, margin, device):
+    input1 = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    target_pos = flow.Tensor(
+        np.ones(shape), dtype=flow.float32, device=flow.device(device)
+    )
+    target_neg = flow.Tensor(
+        -1 * np.ones(shape), dtype=flow.float32, device=flow.device(device)
+    )
+    margin_ranking_loss = flow.nn.MarginRankingLoss(margin=margin, reduction="none")
+    margin_ranking_loss = margin_ranking_loss.to(device)
+    of_out_pos = margin_ranking_loss(input1, input2, target_pos)
+    np_out_pos = np_margin_ranking_loss(
+        margin, input1.numpy(), input2.numpy(), target_pos.numpy(), reduction="none"
+    )
+    test_case.assertTrue(np.allclose(of_out_pos.numpy(), np_out_pos, 1e-05, 1e-05))
+    of_out_neg = margin_ranking_loss(input1, input2, target_neg)
+    np_out_neg = np_margin_ranking_loss(
+        margin, input1.numpy(), input2.numpy(), target_neg.numpy(), reduction="none"
+    )
+    test_case.assertTrue(np.allclose(of_out_neg.numpy(), np_out_neg, 1e-05, 1e-05))
+
+
+def _test_marginrankingloss_mean(test_case, shape, margin, device):
+    input1 = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    target_pos = flow.Tensor(
+        np.ones(shape), dtype=flow.float32, device=flow.device(device)
+    )
+    target_neg = flow.Tensor(
+        -1 * np.ones(shape), dtype=flow.float32, device=flow.device(device)
+    )
+    margin_ranking_loss = flow.nn.MarginRankingLoss(margin=margin, reduction="mean")
+    margin_ranking_loss = margin_ranking_loss.to(device)
+    of_out_pos = margin_ranking_loss(input1, input2, target_pos)
+    np_out_pos = np_margin_ranking_loss(
+        margin, input1.numpy(), input2.numpy(), target_pos.numpy(), reduction="mean"
+    )
+    test_case.assertTrue(np.allclose(of_out_pos.numpy(), np_out_pos, 1e-05, 1e-05))
+    of_out_neg = margin_ranking_loss(input1, input2, target_neg)
+    np_out_neg = np_margin_ranking_loss(
+        margin, input1.numpy(), input2.numpy(), target_neg.numpy(), reduction="mean"
+    )
+    test_case.assertTrue(np.allclose(of_out_neg.numpy(), np_out_neg, 1e-05, 1e-05))
+
+
+def _test_marginrankingloss_sum(test_case, shape, margin, device):
+    input1 = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    target_pos = flow.Tensor(
+        np.ones(shape), dtype=flow.float32, device=flow.device(device)
+    )
+    target_neg = flow.Tensor(
+        -1 * np.ones(shape), dtype=flow.float32, device=flow.device(device)
+    )
+    margin_ranking_loss = flow.nn.MarginRankingLoss(margin=margin, reduction="sum")
+    margin_ranking_loss = margin_ranking_loss.to(device)
+    of_out_pos = margin_ranking_loss(input1, input2, target_pos)
+    np_out_pos = np_margin_ranking_loss(
+        margin, input1.numpy(), input2.numpy(), target_pos.numpy(), reduction="sum"
+    )
+    test_case.assertTrue(np.allclose(of_out_pos.numpy(), np_out_pos, 1e-05, 1e-05))
+    of_out_neg = margin_ranking_loss(input1, input2, target_neg)
+    np_out_neg = np_margin_ranking_loss(
+        margin, input1.numpy(), input2.numpy(), target_neg.numpy(), reduction="sum"
+    )
+    test_case.assertTrue(np.allclose(of_out_neg.numpy(), np_out_neg, 1e-05, 1e-05))
+
+
+def _test_marginrankingloss_grad(test_case, shape, margin, device):
+    input1 = flow.Tensor(
+        np.random.randn(*shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        np.random.randn(*shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    target = flow.Tensor(np.ones(shape), dtype=flow.float32, device=flow.device(device))
+    margin_ranking_loss = flow.nn.MarginRankingLoss(margin=margin, reduction="sum")
+    margin_ranking_loss = margin_ranking_loss.to(device)
+    of_out = margin_ranking_loss(input1, input2, target)
+    of_out.backward()
+    (np_out_grad1, np_out_grad2) = np_margin_ranking_loss_grad(
+        margin, input1.numpy(), input2.numpy(), target.numpy()
+    )
+    test_case.assertTrue(np.allclose(input1.grad.numpy(), np_out_grad1, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(input2.grad.numpy(), np_out_grad2, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMarginRankingLossModule(flow.unittest.TestCase):
+    def test_margin_ranking_loss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_marginrankingloss_none,
+            _test_marginrankingloss_mean,
+            _test_marginrankingloss_sum,
+            _test_marginrankingloss_grad,
+        ]
+        arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)]
+        arg_dict["margin"] = [1.0, 0.3, 10]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_masked_fill.py b/python/oneflow/test/modules/test_masked_fill.py
new file mode 100644
index 0000000000000000000000000000000000000000..056df4d656c6e4baf0f3c8cb06f7ac333468f3db
--- /dev/null
+++ b/python/oneflow/test/modules/test_masked_fill.py
@@ -0,0 +1,70 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+from automated_test_util import *
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMaskedFill(flow.unittest.TestCase):
+    @unittest.skip("has bug now, need rewrite")
+    def test_masked_fill_aginst_pytorch(test_case):
+        import numpy as np
+        import torch
+
+        def mask_tensor(shape):
+            def generator(_):
+                rng = np.random.default_rng()
+                np_arr = rng.integers(low=0, high=2, size=shape)
+                return (
+                    flow.Tensor(np_arr, dtype=flow.int8),
+                    torch.tensor(np_arr, dtype=torch.bool),
+                )
+
+            return generator
+
+        for device in ["cpu", "cuda"]:
+            test_flow_against_pytorch(
+                test_case,
+                "masked_fill",
+                extra_annotations={"mask": flow.Tensor, "value": float},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=4, dim1=5),
+                    "mask": mask_tensor((4, 5)),
+                    "value": constant(3.14),
+                },
+                device=device,
+            )
+            test_tensor_against_pytorch(
+                test_case,
+                "masked_fill",
+                extra_annotations={"mask": flow.Tensor, "value": float},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=4, dim1=5),
+                    "mask": mask_tensor((4, 5)),
+                    "value": constant(3.14),
+                },
+                device=device,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_masked_select.py b/python/oneflow/test/modules/test_masked_select.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced33ad1f0007572413f2f56d2c69f4ca5b8c263
--- /dev/null
+++ b/python/oneflow/test/modules/test_masked_select.py
@@ -0,0 +1,96 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_masked_select(test_case, device):
+    x = flow.Tensor(
+        np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    mask = x.gt(0.05)
+    of_out = flow.masked_select(x, mask)
+    np_out = np.array([0.3139, 0.3898])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array([[0, 1], [1, 0], [0, 0]])
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_masked_select_broadcast(test_case, device):
+    x = flow.Tensor(
+        np.array([[[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    mask = flow.Tensor(
+        np.array(
+            [
+                [[1.0, 0.0], [1.0, 1.0], [0.0, 1.0]],
+                [[1.0, 0], [1.0, 1.0], [0.0, 1.0]],
+                [[1.0, 1.0], [0.0, 1.0], [1.0, 1.0]],
+            ]
+        ),
+        dtype=flow.int8,
+        device=flow.device(device),
+    )
+    of_out = flow.masked_select(x, mask)
+    np_out = [
+        -0.462,
+        0.3898,
+        -0.7197,
+        -0.1657,
+        -0.462,
+        0.3898,
+        -0.7197,
+        -0.1657,
+        -0.462,
+        0.3139,
+        -0.7197,
+        0.0478,
+        -0.1657,
+    ]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[3.0, 1.0], [2.0, 3.0], [1.0, 3.0]]]
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAbs(flow.unittest.TestCase):
+    def test_cosh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_masked_select, _test_masked_select_broadcast]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_math_ops.py b/python/oneflow/test/modules/test_math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3edb8bedb2b8023c3bd570facfb7d16ef93ecb6
--- /dev/null
+++ b/python/oneflow/test/modules/test_math_ops.py
@@ -0,0 +1,884 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_variance_keepdim(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    of_out = flow.Tensor(np_arr, device=flow.device(device)).var(0, True)
+    np_out = np.var(np_arr, 0, keepdims=True)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_variance(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    of_out = flow.var(flow.Tensor(np_arr, device=flow.device(device)), 1, False)
+    np_out = np.var(np_arr, 1, keepdims=False)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_variance_backward(test_case, shape, device):
+    np_arr = np.array(
+        [
+            [
+                [-0.436214, -1.11672411, 0.78394664, 2.0621712],
+                [0.7716703, -1.35367316, -0.40694879, -1.72392356],
+                [-1.08482436, -0.20731248, 1.39633697, 0.32614333],
+            ],
+            [
+                [-1.42467297, -1.78418015, 0.17861511, 0.12065858],
+                [2.03621124, -0.93674042, 0.1943963, 1.98559192],
+                [-0.00436223, 0.37788105, 0.47820872, 0.15467583],
+            ],
+        ]
+    )
+    x = flow.Tensor(np_arr, requires_grad=True, device=flow.device(device))
+    y = flow.var(x, False)
+    z = y.sum()
+    z.backward()
+    np_grad = 2 * (np_arr - np_arr.mean()) / np_arr.size
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestVariance(flow.unittest.TestCase):
+    def test_variance(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_variance, _test_variance_keepdim]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_sinh_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    np_x_grad = np.cosh(np_input)
+    of_out = flow.sinh(of_input)
+    np_out = np.sinh(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_x_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testsinh(flow.unittest.TestCase):
+    def test_sinh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_sinh_impl(test_case, *arg)
+
+    @autotest()
+    def test_flow_sinh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.sinh(x)
+        return y
+
+
+def _test_sin(test_case, shape, device):
+    input = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.sin(input)
+    np_out = np.sin(input.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_sin_backward(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape), requires_grad=True, device=flow.device(device)
+    )
+    y = flow.sin(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np.cos(x.numpy()), 1e-05, 1e-05))
+
+
+def _test_inplace_sin(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=True
+    )
+    x_inplace = x + 1
+    np_out = np.sin(x_inplace.numpy())
+    id_old = id(x_inplace)
+    x_inplace.sin_()
+    test_case.assertEqual(id_old, id(x_inplace))
+    test_case.assertTrue(np.allclose(x_inplace.numpy(), np_out, 1e-05, 1e-05))
+    of_x_inplace = x_inplace.sum()
+    of_x_inplace.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.cos(x_inplace.numpy()), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSin(flow.unittest.TestCase):
+    def test_sin(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_sin, _test_sin_backward, _test_inplace_sin]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_cos(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.cos(input)
+    np_out = np.cos(input.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_cos_backward(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.cos(x)
+    z = y.sum()
+    z.backward()
+    np_grad = -np.sin(x.numpy())
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestCos(flow.unittest.TestCase):
+    def test_cos(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_cos, _test_cos_backward]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_log(test_case, shape, device):
+    np_arr = np.abs(np.random.randn(*shape))
+    input = flow.Tensor(np_arr, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.log(input)
+    np_out = np.log(np_arr)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_log_nan_value(test_case, shape, device):
+    arr = np.array([-0.7168, -0.5471, -0.8933, -1.4428, -0.119])
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    np_out = np.full((5,), np.nan)
+    of_out = flow.log(input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_log_backward(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.log(x)
+    z = y.sum()
+    z.backward()
+    np_grad = 1 / x.numpy()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLog(flow.unittest.TestCase):
+    def test_log(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_log, _test_log_nan_value, _test_log_backward]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_std(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = flow.std(input, dim=2)
+    np_out = np.std(np_arr, axis=2)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_std_dim1(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = flow.std(input, dim=1)
+    np_out = np.std(np_arr, axis=1)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_std_negative_dim(test_case, shape, device):
+    np_arr = np.random.randn(4, 2, 3, 5)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = input.std(dim=(-2, -1, -3), keepdim=False)
+    np_out = np.std(np_arr, axis=(-2, -1, -3))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestStd(flow.unittest.TestCase):
+    def test_std(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_std, _test_std_dim1, _test_std_negative_dim]
+        arg_dict["shape"] = [(2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_sqrt(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    np_arr = np.abs(np_arr)
+    np_out = np.sqrt(np_arr)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = flow.sqrt(input=x)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_sqrt_backward(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    np_arr = np.abs(np_arr)
+    x = flow.Tensor(np_arr, device=flow.device(device), requires_grad=True)
+    y = flow.sqrt(input=x)
+    z = y.sum()
+    z.backward()
+    np_grad = 0.5 * 1 / np.sqrt(x.numpy())
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSqrt(flow.unittest.TestCase):
+    def test_sqrt(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_sqrt, _test_sqrt_backward]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_rsqrt(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    np_arr = np.abs(np_arr)
+    np_out = 1 / np.sqrt(np_arr)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = input.rsqrt()
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_rsqrt_backward(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    np_arr = np.abs(np_arr)
+    x = flow.Tensor(np_arr, device=flow.device(device), requires_grad=True)
+    y = flow.rsqrt(input=x)
+    z = y.sum()
+    z.backward()
+    np_grad = -1 / 2 * 1 / (x.numpy() * np.sqrt(x.numpy()))
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestRsqrt(flow.unittest.TestCase):
+    def test_rsqrt(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_rsqrt, _test_rsqrt_backward]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_square(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    np_out = np.square(np_arr)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = flow.square(x)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_square_backward(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    np_out = np.square(np_arr)
+    x = flow.Tensor(np_arr, device=flow.device(device), requires_grad=True)
+    y = flow.square(x)
+    z = y.sum()
+    z.backward()
+    np_grad = 2 * np_arr
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSquare(flow.unittest.TestCase):
+    def test_square(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_square, _test_square_backward]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_pow(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.pow(input, 2.1)
+    np_out = np.power(input.numpy(), 2.1)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+def _test_pow_backward(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.pow(x, 2.34)
+    z = y.sum()
+    z.backward()
+    np_grad = 2.34 * x.numpy() ** (2.34 - 1)
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPow(flow.unittest.TestCase):
+    def test_pow(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_pow, _test_pow_backward]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def _test_asin(test_case, shape, device):
+    np_input = np.random.random(shape) - 0.5
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.asin(of_input)
+    np_out = np.arcsin(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1 / np.sqrt(1 - np_input ** 2)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001)
+    )
+
+
+def _test_arcsin(test_case, shape, device):
+    np_input = np.random.random(shape) - 0.5
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.arcsin(of_input)
+    np_out = np.arcsin(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1 / np.sqrt(1 - np_input ** 2)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAsin(flow.unittest.TestCase):
+    def test_asin(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_asin(test_case, *arg)
+            _test_arcsin(test_case, *arg)
+
+    @unittest.skip("asin has bug")
+    @autotest()
+    def test_flow_asin_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.asin(x)
+        return y
+
+    @unittest.skip("arcsin has bug")
+    @autotest()
+    def test_flow_arcsin_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.arcsin(x)
+        return y
+
+
+def _test_asinh(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.asinh(of_input)
+    np_out = np.arcsinh(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1 / np.sqrt(1 + np_input ** 2)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001)
+    )
+
+
+def _test_arcsinh(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.arcsinh(of_input)
+    np_out = np.arcsinh(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1 / np.sqrt(1 + np_input ** 2)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAsinh(flow.unittest.TestCase):
+    def test_asinh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_asinh(test_case, *arg)
+            _test_arcsinh(test_case, *arg)
+
+    @autotest()
+    def test_flow_asinh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.asinh(x)
+        return y
+
+    @autotest()
+    def test_flow_arcsinh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.arcsinh(x)
+        return y
+
+
+def _topk_np(input, k, dim: int = None, largest: bool = True, _sorted: bool = True):
+    in_dims = input.shape
+    out_dims = list(in_dims)
+    num_axes = len(input.shape)
+    if dim < 0:
+        dim = dim + num_axes
+    n = in_dims[dim]
+    if k > n:
+        k = n
+    out_dims[dim] = k
+    out_dims = tuple(out_dims)
+    prev_dims = 1
+    next_dims = 1
+    for i in range(dim):
+        prev_dims *= in_dims[i]
+    for i in range(dim + 1, len(in_dims)):
+        next_dims *= in_dims[i]
+    input_flat = input.reshape((prev_dims, n, next_dims))
+    values_ref = np.ndarray(shape=(prev_dims, k, next_dims), dtype=input.dtype)
+    values_ref.fill(0)
+    indices_ref = np.ndarray(shape=(prev_dims, k, next_dims), dtype=np.int64)
+    indices_ref.fill(-1)
+    for i in range(prev_dims):
+        for j in range(next_dims):
+            kv = []
+            for x in range(n):
+                val = input_flat[i, x, j]
+                y = x * next_dims + i * in_dims[dim] * next_dims + j
+                kv.append((val, x, y))
+            cnt = 0
+            for (val, x, y) in sorted(kv, key=lambda x: (x[0], -x[1]), reverse=largest):
+                values_ref[i, cnt, j] = val
+                indices_ref[i, cnt, j] = x
+                cnt += 1
+                if cnt >= k or cnt >= n:
+                    break
+    values_ref = values_ref.reshape(out_dims)
+    indices_ref = indices_ref.reshape(out_dims)
+    return (values_ref, indices_ref)
+
+
+def _test_topk_dim_negative(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 7), dtype=flow.float32, device=flow.device(device)
+    )
+    dim = -1
+    k = 4
+    (of_values, of_indices) = flow.topk(input, k=k, dim=dim)
+    (np_values, np_indices) = _topk_np(input.numpy(), k=k, dim=dim)
+    test_case.assertTrue(
+        np.array_equal(of_values.numpy().flatten(), np_values.flatten())
+    )
+    test_case.assertTrue(
+        np.array_equal(of_indices.numpy().flatten(), np_indices.flatten())
+    )
+
+
+def _test_tensor_topk(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 7), dtype=flow.float32, device=flow.device(device)
+    )
+    dim = 1
+    k = 4
+    (of_values, of_indices) = input.topk(k=k, dim=dim)
+    (np_values, np_indices) = _topk_np(input.numpy(), k=k, dim=dim)
+    test_case.assertTrue(
+        np.array_equal(of_values.numpy().flatten(), np_values.flatten())
+    )
+    test_case.assertTrue(
+        np.array_equal(of_indices.numpy().flatten(), np_indices.flatten())
+    )
+
+
+def _test_topk_dim_positive(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 7), dtype=flow.float32, device=flow.device(device)
+    )
+    dim = 2
+    k = 4
+    (of_values, of_indices) = flow.topk(input, k=k, dim=dim)
+    (np_values, np_indices) = _topk_np(input.numpy(), k=k, dim=dim)
+    test_case.assertTrue(
+        np.array_equal(of_values.numpy().flatten(), np_values.flatten())
+    )
+    test_case.assertTrue(
+        np.array_equal(of_indices.numpy().flatten(), np_indices.flatten())
+    )
+
+
+def _test_topk_largest(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 7), dtype=flow.float32, device=flow.device(device)
+    )
+    dim = 1
+    k = 4
+    largest = False
+    (of_values, of_indices) = flow.topk(input, k=k, dim=dim, largest=False)
+    (np_values, np_indices) = _topk_np(input.numpy(), k=k, dim=dim, largest=False)
+    test_case.assertTrue(
+        np.array_equal(of_values.numpy().flatten(), np_values.flatten())
+    )
+    test_case.assertTrue(
+        np.array_equal(of_indices.numpy().flatten(), np_indices.flatten())
+    )
+
+
+def _test_topk_original(test_case, device):
+    arg_dict = OrderedDict()
+    arg_dict["shape"] = [(10, 10, 200)]
+    arg_dict["axis"] = [-2, 0, 2]
+    arg_dict["k"] = [1, 50, 200]
+    arg_dict["largest"] = [True, False]
+    arg_dict["data_type"] = ["float32", "double"]
+    rng = np.random.default_rng()
+    for (shape, axis, k, largest, data_type) in GenArgList(arg_dict):
+        np_type = type_name_to_np_type[data_type]
+        random_data = rng.standard_normal(size=shape, dtype=np_type)
+        while np.unique(random_data).size != random_data.size:
+            random_data = rng.standard_normal(size=shape, dtype=np_type)
+        input = flow.Tensor(
+            random_data,
+            dtype=type_name_to_flow_type[data_type],
+            device=flow.device(device),
+        )
+        (of_values, of_indices) = flow.topk(input, k=k, dim=axis, largest=largest)
+        (np_values, np_indices) = _topk_np(
+            input.numpy(), k=k, dim=axis, largest=largest
+        )
+        test_case.assertTrue(
+            np.array_equal(of_values.numpy().flatten(), np_values.flatten())
+        )
+        test_case.assertTrue(
+            np.array_equal(of_indices.numpy().flatten(), np_indices.flatten())
+        )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPow(flow.unittest.TestCase):
+    def test_pow(test_case):
+        input = flow.Tensor(np.array([1, 2, 3, 4, 5, 6]), dtype=flow.float32)
+        of_out = flow.pow(input, 2.1)
+        np_out = np.power(input.numpy(), 2.1)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+    def test_pow_tensor_function(test_case):
+        input = flow.Tensor(np.array([1, 2, 3, 4, 5, 6]), dtype=flow.float32)
+        of_out = input.pow(2.1)
+        np_out = np.power(input.numpy(), 2.1)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTopk(flow.unittest.TestCase):
+    def test_topk(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_topk_dim_negative,
+            _test_tensor_topk,
+            _test_topk_dim_positive,
+            _test_topk_largest,
+            _test_topk_original,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+def arccosh_input_tensor(shape):
+    def generator(_):
+        low = 1
+        high = 2
+        rng = np.random.default_rng()
+        np_arr = rng.random(size=shape) * (high - low) + low
+        return (
+            flow.Tensor(np_arr, dtype=flow.float32),
+            torch.tensor(np_arr, dtype=torch.float32),
+        )
+
+    return generator
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+@flow.unittest.skip_unless_1n1d()
+class TestArccosh(flow.unittest.TestCase):
+    @unittest.skip("arccosh has bug")
+    @autotest()
+    def test_arccosh_flow_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.arccosh(x)
+        return y
+
+
+def _test_acosh_impl(test_case, shape, device):
+    np_input = np.random.rand(*shape) + 2.0
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.acosh(of_input)
+    np_out = np.arccosh(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = 1.0 / np.sqrt(np.square(np_input) - 1)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+def acosh_input_tensor(shape):
+    def generator(_):
+        low = 1
+        high = 2
+        rng = np.random.default_rng()
+        np_arr = rng.random(size=shape) * (high - low) + low
+        return (
+            flow.Tensor(np_arr, dtype=flow.float32),
+            torch.tensor(np_arr, dtype=torch.float32),
+        )
+
+    return generator
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+@flow.unittest.skip_unless_1n1d()
+class TestAcosh(flow.unittest.TestCase):
+    def test_acosh(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_acosh_impl(test_case, *arg)
+
+    @unittest.skip("acosh has bug")
+    @autotest()
+    def test_acosh_flow_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.acosh(x)
+        return y
+
+
+def _test_atan2_forward(test_case, shape, scalar, device):
+    np_input_x = 10 * np.random.rand(*shape)
+    np_input_y = 10 * np.random.randn(*shape)
+    of_input_x = flow.Tensor(np_input_x, dtype=flow.float32, device=flow.device(device))
+    of_input_y = flow.Tensor(np_input_y, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.atan2(of_input_x, of_input_y)
+    np_out = np.arctan2(np_input_x, np_input_y)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_atan2_backward(test_case, device):
+    np_input_x = np.random.rand(2, 3)
+    np_input_y = np.random.rand(2, 3)
+    np_y_grad = -1 * np_input_x / (np_input_x * np_input_x + np_input_y * np_input_y)
+    np_x_grad = np_input_y / (np_input_x * np_input_x + np_input_y * np_input_y)
+
+    def test_x_y_grad():
+        of_input_x = flow.Tensor(
+            np_input_x,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_input_y = flow.Tensor(
+            np_input_y,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.atan2(of_input_x, of_input_y)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_x.grad.numpy(), np_x_grad, 0.0001, 0.0001)
+        )
+        test_case.assertTrue(
+            np.allclose(of_input_y.grad.numpy(), np_y_grad, 0.0001, 0.0001)
+        )
+
+    def test_x_grad():
+        of_input_x = flow.Tensor(
+            np_input_x,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_input_y = flow.Tensor(
+            np_input_y, dtype=flow.float32, device=flow.device(device)
+        )
+        of_out = flow.atan2(of_input_x, of_input_y)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_x.grad.numpy(), np_x_grad, 0.0001, 0.0001)
+        )
+
+    def test_y_grad():
+        of_input_x = flow.Tensor(
+            np_input_x, dtype=flow.float32, device=flow.device(device)
+        )
+        of_input_y = flow.Tensor(
+            np_input_y,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.atan2(of_input_x, of_input_y)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_y.grad.numpy(), np_y_grad, 0.0001, 0.0001)
+        )
+
+    test_x_y_grad()
+    test_x_grad()
+    test_y_grad()
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+@flow.unittest.skip_unless_1n1d()
+class TestAtan2(flow.unittest.TestCase):
+    def test_atan2_forward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 3, 4, 5)]
+        arg_dict["scalar"] = [2.1, 0.8]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_atan2_forward(test_case, *arg)
+
+    def test_atan2_backward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_atan2_backward(test_case, *arg)
+
+    @autotest()
+    def test_flow_atan2_with_random_data(test_case):
+        device = random_device()
+        x1 = random_pytorch_tensor(ndim=1, dim0=1).to(device)
+        x2 = random_pytorch_tensor(ndim=1, dim0=1).to(device)
+        y = torch.atan2(x1, x2)
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_matmul.py b/python/oneflow/test/modules/test_matmul.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b0fb5a471ddcc1c083ee5095079f40d19db5b2a
--- /dev/null
+++ b/python/oneflow/test/modules/test_matmul.py
@@ -0,0 +1,350 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+import torch
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_matmul(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(2, 6), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(6, 5), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.matmul(input1, input2)
+    np_out = np.matmul(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_broadcast_matmul(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(3, 4, 5), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(5, 6), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.matmul(input1, input2)
+    np_out = np.matmul(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_batch_matmul(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(10, 3, 4), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(10, 4, 5), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.matmul(input1, input2)
+    np_out = np.matmul(input1.numpy(), input2.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_matmul_backward(test_case, device):
+    input1 = flow.Tensor(
+        [
+            [
+                -0.36023932695388794,
+                0.5571867227554321,
+                -1.4987696409225464,
+                -0.9674592018127441,
+                0.021076146513223648,
+                2.9180469512939453,
+            ],
+            [
+                -0.29169487953186035,
+                0.2978641390800476,
+                0.8198832273483276,
+                -0.3385652005672455,
+                -2.9260432720184326,
+                0.22528153657913208,
+            ],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        [
+            [
+                -0.5270200371742249,
+                -0.4325239062309265,
+                -0.33396217226982117,
+                1.2983192205429077,
+                -0.463693231344223,
+            ],
+            [
+                1.893467903137207,
+                -1.0874812602996826,
+                0.7068315744400024,
+                -0.23532593250274658,
+                -0.011510828509926796,
+            ],
+            [
+                -0.5477776527404785,
+                -0.0381619855761528,
+                0.03451986983418465,
+                -0.8248650431632996,
+                -1.8885509967803955,
+            ],
+            [
+                -1.0034432411193848,
+                0.5428839921951294,
+                -0.7785694599151611,
+                -0.4489346146583557,
+                1.780846118927002,
+            ],
+            [
+                0.9378347396850586,
+                -0.38816362619400024,
+                0.8186876177787781,
+                -0.9630932807922363,
+                -0.11487948149442673,
+            ],
+            [
+                -0.12073716521263123,
+                2.181835174560547,
+                0.5511962175369263,
+                -1.294308066368103,
+                -0.7765272855758667,
+            ],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.matmul(input1, input2)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            -0.45888009667396545,
+            1.2659813165664673,
+            -3.264835834503174,
+            0.09278273582458496,
+            0.2903860807418823,
+            0.5414588451385498,
+        ],
+        [
+            -0.45888009667396545,
+            1.2659813165664673,
+            -3.264835834503174,
+            0.09278273582458496,
+            0.2903860807418823,
+            0.5414588451385498,
+        ],
+    ]
+    test_case.assertTrue(
+        np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_matmul_backward_x_grad(test_case, device):
+    input1 = flow.Tensor(
+        [
+            [-1.8604081869125366, -2.0019688606262207],
+            [1.0511547327041626, -2.263841390609741],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        [
+            [-0.13973912596702576, 0.8478717803955078],
+            [-0.2144828885793686, -1.7145386934280396],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=False,
+    )
+    of_out = flow.matmul(input1, input2)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [0.7081326246261597, -1.9290215969085693],
+        [0.7081326246261597, -1.9290215969085693],
+    ]
+    test_case.assertTrue(
+        np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_matmul_backward_y_grad(test_case, device):
+    input1 = flow.Tensor(
+        [
+            [-1.8604081869125366, -2.0019688606262207],
+            [1.0511547327041626, -2.263841390609741],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=False,
+    )
+    input2 = flow.Tensor(
+        [
+            [-0.13973912596702576, 0.8478717803955078],
+            [-0.2144828885793686, -1.7145386934280396],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.matmul(input1, input2)
+    of_out = of_out.sum()
+    of_out.backward()
+    print(input2.grad.numpy().tolist())
+    np_grad = [
+        [-0.809253454208374, -0.809253454208374],
+        [-4.265810012817383, -4.265810012817383],
+    ]
+    test_case.assertTrue(
+        np.allclose(input2.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_broadcast_matmul_backward(test_case, device):
+    input1 = flow.Tensor(
+        [
+            [
+                [0.5893293023109436, -0.0376124233007431, 0.7791574001312256],
+                [1.1614371538162231, 0.009700910188257694, 0.7281601428985596],
+            ],
+            [
+                [-0.27213698625564575, 0.7058051824569702, -0.4643424451351166],
+                [2.2279646396636963, 0.05870082601904869, -0.18335142731666565],
+            ],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        [
+            [0.25825661420822144, -0.4875393807888031],
+            [-0.040459781885147095, -0.3713535666465759],
+            [-1.633512258529663, -2.0034799575805664],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.matmul(input1, input2)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            [-0.22928276658058167, -0.411813348531723, -3.6369922161102295],
+            [-0.22928276658058167, -0.411813348531723, -3.6369922161102295],
+        ],
+        [
+            [-0.22928276658058167, -0.411813348531723, -3.6369922161102295],
+            [-0.22928276658058167, -0.411813348531723, -3.6369922161102295],
+        ],
+    ]
+    test_case.assertTrue(
+        np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_batch_matmul_backward(test_case, device):
+    input1 = flow.Tensor(
+        [
+            [
+                [-0.0036776792258024216, 1.9946473836898804, -0.423959881067276],
+                [1.0892143249511719, 0.04005361348390579, -0.27883127331733704],
+            ],
+            [
+                [-0.970306396484375, 0.017771577462553978, 0.019596196711063385],
+                [0.27402883768081665, -0.8192587494850159, -0.3135920464992523],
+            ],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    input2 = flow.Tensor(
+        [
+            [
+                [1.118346929550171, -0.930071234703064],
+                [1.1238232851028442, 1.373764157295227],
+                [0.17178462445735931, -1.1010534763336182],
+            ],
+            [
+                [0.6694859862327576, 0.9250285029411316],
+                [-1.0835869312286377, 0.4192655086517334],
+                [1.2616937160491943, 0.33809131383895874],
+            ],
+        ],
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.matmul(input1, input2)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            [0.18827569484710693, 2.4975874423980713, -0.9292688369750977],
+            [0.18827569484710693, 2.4975874423980713, -0.9292688369750977],
+        ],
+        [
+            [1.5945144891738892, -0.6643214225769043, 1.5997850894927979],
+            [1.5945144891738892, -0.6643214225769043, 1.5997850894927979],
+        ],
+    ]
+    test_case.assertTrue(
+        np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_matmul(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_matmul,
+            _test_broadcast_matmul,
+            _test_batch_matmul,
+            _test_matmul_backward,
+            _test_matmul_backward_x_grad,
+            _test_matmul_backward_y_grad,
+            _test_batch_matmul_backward,
+            _test_broadcast_matmul_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest()
+    def test_flow_matmul_with_random_data(test_case):
+        k = random(1, 6)
+        x = random_pytorch_tensor(ndim=2, dim1=k)
+        y = random_pytorch_tensor(ndim=2, dim0=k)
+        z = torch.matmul(x, y)
+        return z
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_mean.py b/python/oneflow/test/modules/test_mean.py
new file mode 100644
index 0000000000000000000000000000000000000000..b292a8b5ba9b27f9ac261893208d1d4cd9f7799e
--- /dev/null
+++ b/python/oneflow/test/modules/test_mean.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_mean(test_case, shape, device):
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.mean(input, dim=1)
+    np_out = np.mean(input.numpy(), axis=1)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.mean(input, dim=0)
+    np_out = np.mean(input.numpy(), axis=0)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_mean_negative_dim(test_case, shape, device):
+    if len(shape) < 4:
+        shape = (2, 3, 4, 5)
+    input = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.mean(input, dim=(-2, -1, -3))
+    np_out = np.mean(input.numpy(), axis=(-2, -1, -3))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_mean_backward(test_case, shape, device):
+    np_arr = np.random.randn(*shape)
+    x = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = flow.mean(x, dim=1)
+    z = y.sum()
+    z.backward()
+    np_grad = np.zeros(shape=np_arr.shape)
+    np_grad[:] = 1 / x.size(1)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMean(flow.unittest.TestCase):
+    def test_mean(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_mean,
+            _test_mean_negative_dim,
+            _test_mean_backward,
+        ]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_mean_against_pytorch(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_type"] = [test_flow_against_pytorch, test_tensor_against_pytorch]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, "mean", device=arg[1])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_meshgrid.py b/python/oneflow/test/modules/test_meshgrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb769083d9ce24483ba363d272e699cadf36f403
--- /dev/null
+++ b/python/oneflow/test/modules/test_meshgrid.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_meshgrid_forawd(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 2, 3]), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.array([4, 5, 6]), dtype=flow.float32, device=flow.device(device)
+    )
+    (np_x, np_y) = np.meshgrid(input1.numpy(), input2.numpy(), indexing="ij")
+    (of_x, of_y) = flow.meshgrid(input1, input2)
+    test_case.assertTrue(np.allclose(of_x.numpy(), np_x, 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(of_y.numpy(), np_y, 0.0001, 0.0001))
+
+
+def _test_meshgrid_forawd_scalr(test_case, device):
+    input1 = flow.Tensor(np.array(1.0), dtype=flow.float32, device=flow.device(device))
+    input2 = flow.Tensor(np.array(2.0), dtype=flow.float32, device=flow.device(device))
+    (np_x, np_y) = np.meshgrid(input1.numpy(), input2.numpy(), indexing="ij")
+    (of_x, of_y) = flow.meshgrid(input1, input2)
+    test_case.assertTrue(np.allclose(of_x.numpy(), np_x, 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(of_y.numpy(), np_y, 0.0001, 0.0001))
+
+
+def _test_meshgrid_forawd_3tensor(test_case, device):
+    input1 = flow.Tensor(
+        np.array([1, 2, 3]), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.array([4, 5, 6]), dtype=flow.float32, device=flow.device(device)
+    )
+    input3 = flow.Tensor(
+        np.array([7, 8, 9]), dtype=flow.float32, device=flow.device(device)
+    )
+    (np_x, np_y, np_z) = np.meshgrid(
+        input1.numpy(), input2.numpy(), input3.numpy(), indexing="ij"
+    )
+    (of_x, of_y, of_z) = flow.meshgrid(input1, input2, input3)
+    test_case.assertTrue(np.allclose(of_x.numpy(), np_x, 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(of_y.numpy(), np_y, 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(of_z.numpy(), np_z, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMeshGrid(flow.unittest.TestCase):
+    def test_meshgrid(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_meshgrid_forawd,
+            _test_meshgrid_forawd_scalr,
+            _test_meshgrid_forawd_3tensor,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_module.py b/python/oneflow/test/modules/test_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..7834d88f6f7470913e5fb025cb936243bf11cc2d
--- /dev/null
+++ b/python/oneflow/test/modules/test_module.py
@@ -0,0 +1,183 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import collections.abc
+import tempfile
+import unittest
+from itertools import repeat
+from typing import Tuple, Union
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.typing as tp
+import oneflow.unittest
+
+
+def np_relu(np_arr):
+    return np.where(np_arr > 0, np_arr, 0)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_nested_module(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.relu = flow.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(x)
+
+        m = CustomModule()
+        x = flow.Tensor(2, 3)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        y = m(x)
+        test_case.assertTrue(np.array_equal(np_relu(x.numpy()), y.numpy()))
+
+    def test_relu(test_case):
+        relu = flow.nn.ReLU()
+        x = flow.Tensor(2, 3)
+        flow.nn.init.uniform_(x, a=-1.0, b=1.0)
+        y = relu(x)
+        test_case.assertTrue(np.array_equal(np_relu(x.numpy()), y.numpy()))
+
+    def test_load_state_dict(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w = flow.nn.Parameter(flow.Tensor(2, 3))
+
+            def forward(self, x):
+                return self.w
+
+        m = CustomModule()
+        ones = np.ones((2, 3), dtype=np.float32)
+        m.load_state_dict({"w": ones})
+        x = flow.Tensor(2, 3)
+        y = m(x).numpy()
+        test_case.assertTrue(np.array_equal(y, ones))
+
+    def test_state_dict(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self, param1, param2):
+                super().__init__()
+                self.param1 = param1
+                self.param2 = param2
+
+        tensor0 = flow.nn.Parameter(flow.Tensor(2, 3))
+        tensor1 = flow.nn.Parameter(flow.Tensor(2, 3))
+        sub_module = CustomModule(tensor0, tensor1)
+        m = CustomModule(tensor1, sub_module)
+        state_dict = m.state_dict()
+        test_case.assertEqual(
+            state_dict,
+            {"param2.param1": tensor0, "param2.param2": tensor1, "param1": tensor1},
+        )
+
+    def test_parameter(test_case):
+        shape = (3, 4)
+        t = flow.Tensor(*shape)
+        p = flow.nn.Parameter(t)
+        test_case.assertEqual(type(p), flow.nn.Parameter)
+        test_case.assertEqual(p.shape, shape)
+
+    def test_module_forward(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self, w):
+                super().__init__()
+                self.w = w
+
+            def forward(self, x):
+                return x + self.w
+
+        m = CustomModule(5)
+        test_case.assertEqual(m(1), 6)
+        m = CustomModule(4)
+        test_case.assertEqual(m(3), 7)
+
+    def test_train_eval(test_case):
+        m = flow.nn.Module()
+        test_case.assertEqual(m.training, True)
+        m.train()
+        test_case.assertEqual(m.training, True)
+        m.eval()
+        test_case.assertEqual(m.training, False)
+
+    def test_module_setattr(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self, param1, param2):
+                super().__init__()
+                self.param1 = param1
+                self.param2 = param2
+
+        param0 = flow.nn.Parameter(flow.Tensor(2, 3))
+        param1 = flow.nn.Parameter(flow.Tensor(2, 3))
+        param2 = CustomModule(param0, param1)
+        m = CustomModule(param1, param2)
+        params = list(m.parameters())
+        test_case.assertEqual(len(params), 2)
+        test_case.assertEqual(params[0], param1)
+        test_case.assertEqual(params[1], param0)
+        children = list(m.children())
+        test_case.assertEqual(len(children), 1)
+        child = children[0]
+        test_case.assertEqual(child, param2)
+        child_params = list(child.parameters())
+        test_case.assertEqual(len(child_params), 2)
+        test_case.assertEqual(child_params[0], param0)
+        test_case.assertEqual(child_params[1], param1)
+
+    def test_module_apply(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.modules = flow.nn.Module()
+
+        global module_num
+        module_num = 0
+
+        def get_module_num(m):
+            global module_num
+            module_num += 1
+
+        net = CustomModule()
+        net.apply(get_module_num)
+        test_case.assertEqual(module_num, 2)
+
+    def test_save_state_dict(test_case):
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.param1 = flow.nn.Parameter(flow.Tensor(32, 1024, 1024))
+                self.param2 = flow.nn.Parameter(flow.Tensor(32, 1024, 1024))
+
+            def forward(self):
+                return self.param1 + self.param2
+
+        m = CustomModule()
+        res1 = m()
+        state_dict = m.state_dict()
+        with tempfile.TemporaryDirectory() as save_dir:
+            flow.save(state_dict, save_dir)
+            loaded_state_dict = flow.load(save_dir)
+            m.load_state_dict(loaded_state_dict)
+        res2 = m()
+        test_case.assertTrue(np.array_equal(res1.numpy(), res2.numpy()))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_module_to.py b/python/oneflow/test/modules/test_module_to.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba9e91d40b9820acd23a5279a1425062f28e4b9f
--- /dev/null
+++ b/python/oneflow/test/modules/test_module_to.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+dummy_val = np.random.randn(2, 3)
+in_val = np.full((2, 3), -2)
+cpu0_device = flow.device("cpu")
+gpu0_device = flow.device("cuda")
+
+
+class DummyModule(flow.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.register_buffer("dummy_buf", flow.Tensor(dummy_val))
+        self.dummy_para = flow.nn.Parameter(flow.Tensor(dummy_val))
+
+    def forward(self, x):
+        return self.dummy_para * x + self.dummy_buf
+
+
+def _test_dummy_module(test_case):
+    m = DummyModule()
+    test_case.assertEqual(m.dummy_buf.device, cpu0_device)
+    test_case.assertEqual(m.dummy_para.device, cpu0_device)
+    input = flow.Tensor(in_val)
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), -dummy_val, 0.0001, 0.0001))
+    test_case.assertEqual(m.dummy_buf.grad, None)
+    test_case.assertEqual(m.dummy_para.grad, None)
+    test_case.assertEqual(input.device, cpu0_device)
+    test_case.assertEqual(output.device, cpu0_device)
+
+
+def _test_dummy_module_to(test_case):
+    m = DummyModule()
+    test_case.assertEqual(m.dummy_buf.device, cpu0_device)
+    test_case.assertEqual(m.dummy_para.device, cpu0_device)
+    m.to(gpu0_device)
+    test_case.assertEqual(m.dummy_buf.device, gpu0_device)
+    test_case.assertTrue(m.dummy_buf.is_leaf)
+    test_case.assertTrue(not m.dummy_buf.requires_grad)
+    test_case.assertEqual(m.dummy_para.device, gpu0_device)
+    test_case.assertTrue(m.dummy_para.is_leaf)
+    test_case.assertTrue(m.dummy_para.requires_grad)
+    input = flow.Tensor(in_val).to(gpu0_device)
+    output = m(input)
+    test_case.assertTrue(np.allclose(output.numpy(), -dummy_val, 0.0001, 0.0001))
+    test_case.assertEqual(m.dummy_buf.grad, None)
+    test_case.assertEqual(m.dummy_para.grad, None)
+    test_case.assertEqual(input.device, gpu0_device)
+    test_case.assertEqual(output.device, gpu0_device)
+    output_grad = flow.ones((2, 3)).to(gpu0_device)
+    output.backward(output_grad)
+    test_case.assertEqual(output_grad.device, gpu0_device)
+    test_case.assertEqual(m.dummy_buf.grad, None)
+    test_case.assertTrue(np.allclose(m.dummy_para.grad.numpy(), in_val, 0.0001, 0.0001))
+    test_case.assertEqual(m.dummy_para.grad.device, gpu0_device)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModuleTo(flow.unittest.TestCase):
+    def test_module_to(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_dummy_module, _test_dummy_module_to]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_mseloss.py b/python/oneflow/test/modules/test_mseloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b089e1fd7ac6e103213e12695e75107a9b8d3f
--- /dev/null
+++ b/python/oneflow/test/modules/test_mseloss.py
@@ -0,0 +1,78 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_mseloss(np_input, np_target):
+    np_mse = np.square(np_target - np_input)
+    np_mse_mean = np.mean(np_mse)
+    np_mse_sum = np.sum(np_mse)
+    return {"none": np_mse, "mean": np_mse_mean, "sum": np_mse_sum}
+
+
+def _np_mseloss_grad(np_input, np_target):
+    elem_cnt = np_input.size
+    np_mse_grad_sum = -2 * (np_target - np_input)
+    np_mse_grad_mean = np_mse_grad_sum / elem_cnt
+    return {"none": np_mse_grad_sum, "mean": np_mse_grad_mean, "sum": np_mse_grad_sum}
+
+
+def _test_mseloss_impl(test_case, device, shape, reduction):
+    x = np.random.randn(*shape)
+    y = np.random.randn(*shape)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    loss = flow.nn.MSELoss(reduction=reduction)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    np_out = _np_mseloss(x, y)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_mseloss_grad(x, y)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMSELossModule(flow.unittest.TestCase):
+    def test_mseloss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_mseloss_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [
+            (3, 5),
+            (10, 9, 21),
+            (14, 22, 9, 21),
+            (3, 2, 4, 16, 5),
+            (1,),
+        ]
+        arg_dict["reduction"] = ["none", "mean", "sum"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_mul.py b/python/oneflow/test/modules/test_mul.py
new file mode 100644
index 0000000000000000000000000000000000000000..907c4b5a19283a1137fe5db8f77b113c11588316
--- /dev/null
+++ b/python/oneflow/test/modules/test_mul.py
@@ -0,0 +1,133 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_mul_impl(test_case, device):
+    x = flow.Tensor(
+        np.random.randn(2, 3), device=flow.device(device), requires_grad=True
+    )
+    y = flow.Tensor(
+        np.random.randn(2, 3), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.mul(x, y)
+    np_out = np.multiply(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad_x = y.numpy()
+    np_grad_y = x.numpy()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad_x, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y.grad.numpy(), np_grad_y, 1e-05, 1e-05))
+    x = 5
+    y = flow.Tensor(np.random.randn(2, 3), device=flow.device(device))
+    of_out = flow.mul(x, y)
+    np_out = np.multiply(x, y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    x = flow.Tensor(np.random.randn(2, 3), device=flow.device(device))
+    y = 5
+    of_out = flow.mul(x, y)
+    np_out = np.multiply(x.numpy(), y)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    x = flow.Tensor(
+        np.random.randn(1, 1), device=flow.device(device), requires_grad=True
+    )
+    y = flow.Tensor(
+        np.random.randn(2, 3), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.mul(x, y)
+    np_out = np.multiply(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np.sum(y.numpy()), 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y.grad.numpy(), x.numpy(), 1e-05, 1e-05))
+    x = flow.Tensor(
+        np.random.randn(1, 1), device=flow.device(device), requires_grad=True
+    )
+    y = flow.Tensor(
+        np.random.randn(2, 3, 4), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.mul(x, y)
+    np_out = np.multiply(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np.sum(y.numpy()), 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y.grad.numpy(), x.numpy(), 1e-05, 1e-05))
+    x = flow.Tensor(
+        np.random.randn(1, 1), device=flow.device(device), requires_grad=True
+    )
+    y = flow.Tensor(
+        np.random.randn(2, 3, 4, 5), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.mul(x, y)
+    np_out = np.multiply(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np.sum(y.numpy()), 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y.grad.numpy(), x.numpy(), 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMulModule(flow.unittest.TestCase):
+    def test_mul(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_mul_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_mul_against_pytorch(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_type"] = [test_flow_against_pytorch, test_tensor_against_pytorch]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["op"] = ["mul"]
+        for arg in GenArgList(arg_dict):
+            arg[0](
+                test_case,
+                arg[2],
+                extra_annotations={"other": flow.Tensor},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=2, dim1=3),
+                    "other": random_tensor(ndim=2, dim0=2, dim1=3),
+                },
+                device=arg[1],
+            )
+            arg[0](
+                test_case,
+                arg[2],
+                extra_annotations={"other": float},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=2, dim1=3),
+                    "other": random(0, 5),
+                },
+                device=arg[1],
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_ne.py b/python/oneflow/test/modules/test_ne.py
new file mode 100644
index 0000000000000000000000000000000000000000..229ec5a8f79cfdb27100b19bc625cbb0055ce1d7
--- /dev/null
+++ b/python/oneflow/test/modules/test_ne.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_ne(test_case, shape, device):
+    arr1 = np.random.randn(*shape)
+    arr2 = np.random.randn(*shape)
+    input = flow.Tensor(arr1, dtype=flow.float32, device=flow.device(device))
+    other = flow.Tensor(arr2, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.ne(input, other)
+    of_out2 = flow.not_equal(input, other)
+    np_out = np.not_equal(arr1, arr2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    test_case.assertTrue(np.array_equal(of_out2.numpy(), np_out))
+
+
+def _test_tensor_ne_operator(test_case, shape, device):
+    arr1 = np.random.randn(*shape)
+    arr2 = np.random.randn(*shape)
+    input = flow.Tensor(arr1, dtype=flow.float32, device=flow.device(device))
+    other = flow.Tensor(arr2, dtype=flow.float32, device=flow.device(device))
+    of_out = input.ne(other)
+    np_out = np.not_equal(arr1, arr2)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_ne_int(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1
+    of_out = flow.ne(input, num)
+    np_out = np.not_equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tensor_ne_operator_int(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1
+    of_out = input.ne(num)
+    np_out = np.not_equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_ne_float(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1.0
+    of_out = flow.ne(input, num)
+    np_out = np.not_equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tensor_ne_operator_float(test_case, shape, device):
+    arr = np.random.randn(*shape)
+    input = flow.Tensor(arr, dtype=flow.float32, device=flow.device(device))
+    num = 1.0
+    of_out = input.ne(num)
+    np_out = np.not_equal(arr, num)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNe(flow.unittest.TestCase):
+    def test_ne(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_func"] = [
+            _test_ne,
+            _test_tensor_ne_operator,
+            _test_ne_int,
+            _test_tensor_ne_operator_int,
+            _test_ne_float,
+            _test_tensor_ne_operator_float,
+        ]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_negative.py b/python/oneflow/test/modules/test_negative.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f8e7e45a162bb0d0598cd4f74d5179d6b54a7ee
--- /dev/null
+++ b/python/oneflow/test/modules/test_negative.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_negtive(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.negative(input)
+    np_out = -input.numpy()
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_negative_neg(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.neg(input)
+    np_out = -input.numpy()
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tensor_negative(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device))
+    of_out = input.negative()
+    np_out = -input.numpy()
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_negative_backward(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.negative(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = -np.ones(shape)
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNegativeModule(flow.unittest.TestCase):
+    def test_negative(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_negtive,
+            _test_negative_neg,
+            _test_tensor_negative,
+            _test_negative_backward,
+        ]
+        arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_nllloss.py b/python/oneflow/test/modules/test_nllloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..67005f7ee39041e75334a9cd379999a3e19c0d35
--- /dev/null
+++ b/python/oneflow/test/modules/test_nllloss.py
@@ -0,0 +1,324 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def nll_loss_1d(logs, targets, reduction="none", ignore_index=None):
+    input_shape = logs.shape
+    N = input_shape[0]
+    C = input_shape[1]
+    out = np.zeros_like(targets).astype(np.float64)
+    total_weight = N
+    for i in range(N):
+        cur_target = targets[i]
+        out[i] = -logs[i][cur_target]
+    if ignore_index is not None:
+        condition = targets != ignore_index
+        out *= condition
+    if reduction == "sum":
+        return np.sum(out)
+    elif reduction == "mean":
+        if ignore_index is not None:
+            reduce_sum = out.sum()
+            reduce_count = np.argwhere(condition).shape[0]
+            return reduce_sum / reduce_count
+        else:
+            return out.sum() / total_weight
+    elif reduction == "none":
+        return out
+
+
+def nll_loss_2d(logs, targets, reduction="none", ignore_index=None):
+    input_shape = logs.shape
+    N = input_shape[0]
+    H = input_shape[2]
+    W = input_shape[3]
+    out = np.zeros_like(targets).astype(np.float64)
+    total_weight = N * H * W
+    for i in range(N):
+        for h in range(H):
+            for w in range(W):
+                cur_target = targets[i][h][w]
+                out[i][h][w] = -logs[i][cur_target][h][w]
+    if ignore_index is not None:
+        condition = targets != ignore_index
+        out *= condition
+    if reduction == "sum":
+        return np.sum(out)
+    elif reduction == "mean":
+        if ignore_index is not None:
+            reduce_sum = out.sum()
+            reduce_count = np.argwhere(condition).shape[0]
+            return reduce_sum / reduce_count
+        else:
+            return out.sum() / total_weight
+    elif reduction == "none":
+        return out
+
+
+def nll_loss_bert(logs, targets, reduction="none", ignore_index=None):
+    input_shape = logs.shape
+    N = input_shape[0]
+    H = input_shape[2]
+    out = np.zeros_like(targets).astype(np.float64)
+    total_weight = N * H
+    for i in range(N):
+        for h in range(H):
+            cur_target = targets[i][h]
+            out[i][h] = -logs[i][cur_target][h]
+    if ignore_index is not None:
+        condition = targets != ignore_index
+        out *= condition
+    if reduction == "sum":
+        return np.sum(out)
+    elif reduction == "mean":
+        if ignore_index is not None:
+            reduce_sum = out.sum()
+            reduce_count = np.argwhere(condition).shape[0]
+            return reduce_sum / reduce_count
+        else:
+            return out.sum() / total_weight
+    elif reduction == "none":
+        return out
+
+
+def _test_nllloss_none(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_1d(input.numpy(), target.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="none", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_1d(input.numpy(), target.numpy(), ignore_index=1)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_mean(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_1d(input.numpy(), target.numpy(), reduction="mean")
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="mean", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_1d(
+        input.numpy(), target.numpy(), reduction="mean", ignore_index=1
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_sum(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_1d(input.numpy(), target.numpy(), reduction="sum")
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="sum", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_1d(input.numpy(), target.numpy(), reduction="sum", ignore_index=1)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_segmentation_none(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_2d(input.numpy(), target.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="none", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_2d(input.numpy(), target.numpy(), ignore_index=1)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_segmentation_mean(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_2d(input.numpy(), target.numpy(), reduction="mean")
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="mean", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_2d(
+        input.numpy(), target.numpy(), reduction="mean", ignore_index=1
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_segmentation_sum(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_2d(input.numpy(), target.numpy(), reduction="sum")
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="sum", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_2d(input.numpy(), target.numpy(), reduction="sum", ignore_index=1)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_bert_none(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_bert(input.numpy(), target.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="none", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_bert(input.numpy(), target.numpy(), ignore_index=1)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_bert_mean(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_bert(input.numpy(), target.numpy(), reduction="mean")
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="mean", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_bert(
+        input.numpy(), target.numpy(), reduction="mean", ignore_index=1
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+def _test_nllloss_bert_sum(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_bert(input.numpy(), target.numpy(), reduction="sum")
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+    nll_loss = flow.nn.NLLLoss(reduction="sum", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    np_out = nll_loss_bert(
+        input.numpy(), target.numpy(), reduction="sum", ignore_index=1
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNLLLossModule(flow.unittest.TestCase):
+    def test_nllloss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_nllloss_none,
+            _test_nllloss_mean,
+            _test_nllloss_sum,
+            _test_nllloss_segmentation_none,
+            _test_nllloss_segmentation_mean,
+            _test_nllloss_segmentation_sum,
+            _test_nllloss_bert_none,
+            _test_nllloss_bert_mean,
+            _test_nllloss_bert_sum,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_nllloss_grad.py b/python/oneflow/test/modules/test_nllloss_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa296ca08d11bd5060db05573afda9a749419506
--- /dev/null
+++ b/python/oneflow/test/modules/test_nllloss_grad.py
@@ -0,0 +1,489 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_nllloss_none_backward(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [-1.0, 0.0, 0.0],
+        [0.0, 0.0, -1.0],
+        [0.0, -1.0, 0.0],
+        [0.0, -1.0, 0.0],
+        [-1.0, 0.0, 0.0],
+    ]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_mean_backward(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [-0.20000000298023224, 0.0, 0.0],
+        [0.0, 0.0, -0.20000000298023224],
+        [0.0, -0.20000000298023224, 0.0],
+        [0.0, -0.20000000298023224, 0.0],
+        [-0.20000000298023224, 0.0, 0.0],
+    ]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_sum_backward(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [-1.0, 0.0, 0.0],
+        [0.0, 0.0, -1.0],
+        [0.0, -1.0, 0.0],
+        [0.0, -1.0, 0.0],
+        [-1.0, 0.0, 0.0],
+    ]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_segmentation_none_backward(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[0.0, -1.0], [-1.0, 0.0]], [[-1.0, 0.0], [0.0, -1.0]]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_segmentation_mean_backward(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[0.0, -0.25], [-0.25, 0.0]], [[-0.25, 0.0], [0.0, -0.25]]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_segmentation_sum_backward(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[0.0, -1.0], [-1.0, 0.0]], [[-1.0, 0.0], [0.0, -1.0]]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_bert_none_backward(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[0.0, -1.0, -1.0, 0.0], [-1.0, 0.0, 0.0, -1.0]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_bert_mean_backward(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[0.0, -0.25, -0.25, 0.0], [-0.25, 0.0, 0.0, -0.25]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_bert_sum_backward(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum")
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[0.0, -1.0, -1.0, 0.0], [-1.0, 0.0, 0.0, -1.0]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_none_backward_with_ignore_index(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [-1.0, 0.0, 0.0],
+        [0.0, 0.0, -1.0],
+        [0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0],
+        [-1.0, 0.0, 0.0],
+    ]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_mean_backward_with_ignore_index(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [-0.33333, 0.0, 0.0],
+        [0.0, 0.0, -0.33333],
+        [0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0],
+        [-0.33333, 0.0, 0.0],
+    ]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_sum_backward_with_ignore_index(test_case, device):
+    x = np.array(
+        [
+            [0.88103855, 0.9908683, 0.6226845],
+            [0.53331435, 0.07999352, 0.8549948],
+            [0.25879037, 0.39530203, 0.698465],
+            [0.73427284, 0.63575995, 0.18827209],
+            [0.05689114, 0.0862954, 0.6325046],
+        ]
+    ).astype(np.float32)
+    y = np.array([0, 2, 1, 1, 0]).astype(np.int)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [-1.0, 0.0, 0.0],
+        [0.0, 0.0, -1.0],
+        [0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0],
+        [-1.0, 0.0, 0.0],
+    ]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_segmentation_none_backward_with_ignore_index(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[0.0, -1.0], [-1.0, 0.0]], [[0.0, 0.0], [0.0, 0.0]]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_segmentation_mean_backward_with_ignore_index(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[0.0, -0.5], [-0.5, 0.0]], [[0.0, 0.0], [0.0, 0.0]]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_segmentation_sum_backward_with_ignore_index(test_case, device):
+    x = np.array(
+        [[[[0.12, 0.36], [0.22, 0.66]], [[0.13, 0.34], [0.52, -0.96]]]]
+    ).astype(np.float32)
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[[1, 0], [0, 1]]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[0.0, -1.0], [-1.0, 0.0]], [[0.0, 0.0], [0.0, 0.0]]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_bert_none_backward_with_ignore_index(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="none", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[0.0, -1.0, -1.0, 0.0], [0.0, 0.0, 0.0, 0.0]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_bert_mean_backward_with_ignore_index(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="mean", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[0.0, -0.5, -0.5, 0.0], [0.0, 0.0, 0.0, 0.0]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+def _test_nllloss_bert_sum_backward_with_ignore_index(test_case, device):
+    x = np.array([[[0.12, 0.36, 0.22, 0.66], [0.13, 0.34, 0.52, -0.96]]]).astype(
+        np.float32
+    )
+    input = flow.Tensor(
+        x, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    y = np.array([[1, 0, 0, 1]]).astype(np.int)
+    target = flow.Tensor(y, dtype=flow.int64, device=flow.device(device))
+    nll_loss = flow.nn.NLLLoss(reduction="sum", ignore_index=1)
+    nll_loss = nll_loss.to(device)
+    of_out = nll_loss(input, target)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[0.0, -1.0, -1.0, 0.0], [0.0, 0.0, 0.0, 0.0]]]
+    test_case.assertTrue(
+        np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNLLLossModule(flow.unittest.TestCase):
+    def test_nllloss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_nllloss_none_backward,
+            _test_nllloss_mean_backward,
+            _test_nllloss_sum_backward,
+            _test_nllloss_segmentation_none_backward,
+            _test_nllloss_segmentation_mean_backward,
+            _test_nllloss_segmentation_sum_backward,
+            _test_nllloss_bert_none_backward,
+            _test_nllloss_bert_mean_backward,
+            _test_nllloss_bert_sum_backward,
+            _test_nllloss_none_backward_with_ignore_index,
+            _test_nllloss_mean_backward_with_ignore_index,
+            _test_nllloss_sum_backward_with_ignore_index,
+            _test_nllloss_segmentation_none_backward_with_ignore_index,
+            _test_nllloss_segmentation_mean_backward_with_ignore_index,
+            _test_nllloss_segmentation_sum_backward_with_ignore_index,
+            _test_nllloss_bert_none_backward_with_ignore_index,
+            _test_nllloss_bert_mean_backward_with_ignore_index,
+            _test_nllloss_bert_sum_backward_with_ignore_index,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_norm.py b/python/oneflow/test/modules/test_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3525bca9a0fa5837cf7fc720eb853db53978fdc8
--- /dev/null
+++ b/python/oneflow/test/modules/test_norm.py
@@ -0,0 +1,260 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_vector_norm_backward(x, ord=2, dim=None):
+    re = np.zeros_like(x)
+    if isinstance(ord, int) and isinstance(dim, int):
+        if ord == 0:
+            return re
+        else:
+            temp = np.sum(np.abs(x ** ord), dim) ** (1.0 / ord - 1)
+            re = np.where(x ** ord < 0, -temp, temp) * x ** (ord - 1)
+    elif dim == None and x.ndim == 1:
+        if ord == 0:
+            return re
+        elif ord == float("inf"):
+            max_ind = np.argmax(np.abs(x))
+            re[max_ind] += 1 if x[max_ind] != 0 else 0
+            re = np.where(x < 0, -re, re)
+        elif ord == float("-inf"):
+            min_ind = np.argmin(np.abs(x))
+            re[min_ind] += 1 if x[min_ind] != 0 else 0
+            re = np.where(x < 0, -re, re)
+        else:
+            temp = np.sum(np.abs(x ** ord)) ** (1.0 / ord - 1)
+            re = np.where(x ** ord < 0, -temp, temp) * x ** (ord - 1)
+    elif (
+        isinstance(ord, float)
+        and isinstance(dim, int)
+        and (ord in [float("inf"), float("-inf")])
+    ):
+        if ord == float("inf"):
+            max_ind = np.argmax(np.abs(x), dim)
+            index = (
+                [(i, max_ind[i]) for i in range(len(max_ind))]
+                if dim == 1
+                else [(max_ind[i], i) for i in range(len(max_ind))]
+            )
+            print(index)
+            for j in index:
+                re[j] += 1 if x[j] != 0 else 0
+            re = np.where(x < 0, -re, re)
+        else:
+            min_ind = np.argmin(np.abs(x), dim)
+            index = (
+                [(i, min_ind[i]) for i in range(len(min_ind))]
+                if dim == 1
+                else [(min_ind[i], i) for i in range(len(min_ind))]
+            )
+            for j in index:
+                re[j] += 1 if x[j] != 0 else 0
+            re = np.where(x < 0, -re, re)
+    return re
+
+
+def _np_matrix_norm_backward(x, ord="fro"):
+    re = np.zeros_like(x)
+    if isinstance(ord, int):
+        if ord == 1:
+            max_ind = np.argmax(np.sum(np.abs(x), 0))
+            index = [(i, max_ind) for i in range(x.shape[0])]
+            for j in index:
+                re[j] += 1 if x[j] != 0 else 0
+            re = np.where(x < 0, -re, re)
+        elif ord == -1:
+            min_ind = np.argmin(np.sum(np.abs(x), 0))
+            index = [(i, min_ind) for i in range(x.shape[0])]
+            for j in index:
+                re[j] += 1 if x[j] != 0 else 0
+            re = np.where(x < 0, -re, re)
+    elif ord == "fro":
+        re = np.sum(x ** 2) ** (-0.5) * x
+    elif isinstance(ord, float) and ord in [float("inf"), float("-inf")]:
+        if ord == float("inf"):
+            max_ind = np.argmax(np.sum(np.abs(x), 1))
+            index = [(max_ind, i) for i in range(x.shape[1])]
+            for j in index:
+                re[j] += 1 if x[j] != 0 else 0
+            re = np.where(x < 0, -re, re)
+        else:
+            min_ind = np.argmin(np.sum(np.abs(x), 1))
+            index = [(min_ind, i) for i in range(x.shape[1])]
+            for j in index:
+                re[j] += 1 if x[j] != 0 else 0
+            re = np.where(x < 0, -re, re)
+    return re
+
+
+def _test_norm_1d(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(10), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out_1 = flow.linalg.norm(input)
+    of_out_2 = flow.linalg.norm(input, ord=0)
+    of_out_3 = flow.linalg.norm(input, ord=3)
+    of_out_4 = flow.linalg.norm(input, ord=float("inf"))
+    of_out_5 = flow.linalg.norm(input, ord=-float("inf"))
+    np_out_1 = np.linalg.norm(input.numpy())
+    np_out_2 = np.linalg.norm(input.numpy(), ord=0)
+    np_out_3 = np.linalg.norm(input.numpy(), ord=3)
+    np_out_4 = np.linalg.norm(input.numpy(), ord=float("inf"))
+    np_out_5 = np.linalg.norm(input.numpy(), ord=-float("inf"))
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out_1, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out_2, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_3.numpy(), np_out_3, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_4.numpy(), np_out_4, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_5.numpy(), np_out_5, 1e-05, 1e-05))
+
+
+def _test_norm_2d(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(5, 4), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out_1 = flow.linalg.norm(input)
+    of_out_2 = flow.linalg.norm(input, dim=0)
+    of_out_3 = flow.linalg.norm(input, dim=1, keepdim=True)
+    of_out_4 = flow.linalg.norm(input, ord=1, dim=0)
+    of_out_5 = flow.linalg.norm(input, ord=-1, dim=1, keepdim=True)
+    np_out_1 = np.linalg.norm(input.numpy())
+    np_out_2 = np.linalg.norm(input.numpy(), axis=0)
+    np_out_3 = np.linalg.norm(input.numpy(), axis=1, keepdims=True)
+    np_out_4 = np.linalg.norm(input.numpy(), ord=1, axis=0)
+    np_out_5 = np.linalg.norm(input.numpy(), ord=-1, axis=1, keepdims=True)
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out_1, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out_2, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_3.numpy(), np_out_3, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_4.numpy(), np_out_4, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_5.numpy(), np_out_5, 1e-05, 1e-05))
+
+
+def _test_norm_Nd(test_case, device):
+    input1 = flow.Tensor(
+        np.random.randn(3, 4, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    input2 = flow.Tensor(
+        np.random.randn(3, 4, 3, 5), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out_1 = flow.linalg.norm(input1)
+    of_out_2 = flow.linalg.norm(input1, dim=(0, 1))
+    of_out_3 = flow.linalg.norm(input2, dim=(0, 2))
+    np_out_1 = np.linalg.norm(input1.numpy())
+    np_out_2 = np.linalg.norm(input1.numpy(), axis=(0, 1))
+    np_out_3 = np.linalg.norm(input2.numpy(), axis=(0, 2))
+    test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out_1, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out_2, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out_3.numpy(), np_out_3, 1e-05, 1e-05))
+
+
+def _test_fro_order_norm_backward(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(5, 4),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.linalg.norm(input)
+    of_out.backward()
+    np_out_grad = _np_matrix_norm_backward(input.numpy())
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_out_grad, 1e-05, 1e-05))
+
+
+def _test_1d_inf_order_norm_backward(test_case, device):
+    for ord in [float("inf"), -float("inf")]:
+        input = flow.Tensor(
+            np.random.randn(5),
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.linalg.norm(input, ord=ord)
+        of_out.backward()
+        np_out_grad = _np_vector_norm_backward(input.numpy(), ord=ord)
+        test_case.assertTrue(np.allclose(input.grad.numpy(), np_out_grad, 1e-05, 1e-05))
+
+
+def _test_2d_inf_order_norm_backward(test_case, device):
+    for ord in [float("inf"), -float("inf")]:
+        input = flow.Tensor(
+            np.random.randn(5, 4),
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.linalg.norm(input, ord=ord)
+        of_out.backward()
+        np_out_grad = _np_matrix_norm_backward(input.numpy(), ord=ord)
+        test_case.assertTrue(np.allclose(input.grad.numpy(), np_out_grad, 1e-05, 1e-05))
+
+
+def _test_1d_digits_order_norm_backward(test_case, device):
+    for ord in [1, -1, 2, -2, 5]:
+        input = flow.Tensor(
+            np.random.randn(5),
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.linalg.norm(input, ord=ord)
+        of_out.backward()
+        np_out_grad = _np_vector_norm_backward(input.numpy(), ord=ord)
+        test_case.assertTrue(np.allclose(input.grad.numpy(), np_out_grad, 1e-05, 1e-05))
+
+
+def _test_2d_digits_order_norm_backward(test_case, device):
+    for ord in [1, -1]:
+        input = flow.Tensor(
+            np.random.randn(4, 5),
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.linalg.norm(input, ord=ord)
+        of_out.backward()
+        np_out_grad = _np_matrix_norm_backward(input.numpy(), ord=ord)
+        test_case.assertTrue(np.allclose(input.grad.numpy(), np_out_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestNormModule(flow.unittest.TestCase):
+    def test_norm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [
+            _test_norm_1d,
+            _test_norm_2d,
+            _test_norm_Nd,
+            _test_fro_order_norm_backward,
+            _test_1d_inf_order_norm_backward,
+            _test_2d_inf_order_norm_backward,
+            _test_1d_digits_order_norm_backward,
+            _test_2d_digits_order_norm_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_normalization.py b/python/oneflow/test/modules/test_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..2542238376abd229b53c485dbe4e9abfcf869f10
--- /dev/null
+++ b/python/oneflow/test/modules/test_normalization.py
@@ -0,0 +1,141 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+input_arr = np.array(
+    [
+        [
+            [[-0.16046895, -1.03667831], [-0.34974465, 0.26505867]],
+            [[-1.24111986, -0.53806001], [1.72426331, 0.43572459]],
+        ],
+        [
+            [[-0.77390957, -0.42610624], [0.16398858, -1.35760343]],
+            [[1.07541728, 0.11008703], [0.26361224, -0.48663723]],
+        ],
+    ],
+    dtype=np.float32,
+)
+
+
+def _test_layernorm(test_case, device):
+    output = np.array(
+        [
+            [
+                [[-0.0544118, -1.0509688], [-0.2696846, 0.4295622]],
+                [[-1.2834904, -0.4838651], [2.0891891, 0.6236691]],
+            ],
+            [
+                [[-0.8555527, -0.3554582], [0.493019, -1.694826]],
+                [[1.8035311, 0.4155158], [0.6362644, -0.4424936]],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    m = flow.nn.LayerNorm(x.size()[1:]).to(device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_layernorm_v2(test_case, device):
+    output = np.array(
+        [
+            [
+                [[0.3406544, -1.5249983], [-0.0623574, 1.2467014]],
+                [[-1.2004623, -0.5688803], [1.4634399, 0.3059027]],
+            ],
+            [
+                [[-0.3180245, 0.3122248], [1.3815271, -1.3757277]],
+                [[1.497291, -0.2341234], [0.0412391, -1.3044068]],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    m = flow.nn.LayerNorm([2, 2], eps=1e-05).to(device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_layernorm_v3(test_case, device):
+    output = np.array(
+        [
+            [
+                [[0.999974, -0.999974], [-0.999947, 0.999947]],
+                [[-0.9999595, 0.9999595], [0.999988, -0.999988]],
+            ],
+            [
+                [[-0.9998344, 0.9998341], [0.9999914, -0.9999914]],
+                [[0.9999787, -0.9999787], [0.9999645, -0.9999645]],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    m = flow.nn.LayerNorm(2, elementwise_affine=True).to(device=flow.device(device))
+    y = m(x)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+
+
+def _test_layernorm_backward(test_case, device):
+    output = np.array(
+        [
+            [
+                [[-0.0544118, -1.0509688], [-0.2696846, 0.4295622]],
+                [[-1.2834904, -0.4838651], [2.0891891, 0.6236691]],
+            ],
+            [
+                [[-0.8555527, -0.3554582], [0.493019, -1.694826]],
+                [[1.8035311, 0.4155158], [0.6362644, -0.4424936]],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    m = flow.nn.LayerNorm(x.size()[1:]).to(device=flow.device(device))
+    y = m(x)
+    z = y.sum()
+    z.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.zeros(shape=input_arr.shape), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLayerNorm(flow.unittest.TestCase):
+    def test_layernorm(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_layernorm,
+            _test_layernorm_v2,
+            _test_layernorm_v3,
+            _test_layernorm_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_ones_like.py b/python/oneflow/test/modules/test_ones_like.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce9a981d756530e3947f38ef8766f08656cf8b6
--- /dev/null
+++ b/python/oneflow/test/modules/test_ones_like.py
@@ -0,0 +1,59 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_ones_like_float(test_case, shape, device):
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = flow.ones_like(x)
+    test_case.assertTrue(y.dtype is flow.float32)
+    test_case.assertTrue(y.shape == x.shape)
+    test_case.assertTrue(y.device == x.device)
+    y_numpy = np.ones_like(x.numpy())
+    test_case.assertTrue(np.array_equal(y.numpy(), y_numpy))
+
+
+def _test_ones_like_int(test_case, shape, device):
+    x = flow.Tensor(np.random.randn(*shape), dtype=flow.int, device=flow.device(device))
+    y = flow.ones_like(x)
+    test_case.assertTrue(y.dtype is flow.int)
+    test_case.assertTrue(y.shape == x.shape)
+    test_case.assertTrue(y.device == x.device)
+    y_numpy = np.ones_like(x.numpy())
+    test_case.assertTrue(np.array_equal(y.numpy(), y_numpy))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_ones_like(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_ones_like_float, _test_ones_like_int]
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_optim_adam.py b/python/oneflow/test/modules/test_optim_adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc56d0a6a9f7730ef2d67cb35e6118f9aa9d337d
--- /dev/null
+++ b/python/oneflow/test/modules/test_optim_adam.py
@@ -0,0 +1,114 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.nn.parameter import Parameter
+
+
+def compare_with_numpy_adam(
+    test_case,
+    device,
+    x_shape,
+    scale,
+    learning_rate,
+    train_iters,
+    betas,
+    weight_decay,
+    eps,
+):
+    random_grad_seq = []
+    for _ in range(train_iters):
+        random_grad_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = np.random.uniform(size=x_shape).astype(np.float32)
+
+    def train_by_oneflow():
+        x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
+        adam = flow.optim.Adam(
+            [
+                {
+                    "params": [x],
+                    "lr": learning_rate,
+                    "betas": betas,
+                    "eps": eps,
+                    "weight_decay": weight_decay,
+                    "scale": scale,
+                }
+            ]
+        )
+
+        def train_one_iter(grad):
+            grad_tensor = flow.Tensor(
+                grad, requires_grad=False, device=flow.device(device)
+            )
+            loss = flow.sum(x * grad_tensor)
+            loss.backward()
+            adam.step()
+            adam.zero_grad()
+
+        for i in range(train_iters):
+            train_one_iter(random_grad_seq[i])
+        return x
+
+    def train_by_numpy():
+        x = init_value
+        vt = np.zeros_like(x)
+        st = np.zeros_like(x)
+        beta1 = betas[0]
+        beta2 = betas[1]
+
+        def train_one_iter(grad):
+            grad = grad * scale + weight_decay * x
+            v = beta1 * vt + (1 - beta1) * grad
+            s = beta2 * st + (1 - beta2) * grad * grad
+            param = x - learning_rate * (v / (np.sqrt(s) + eps))
+            return (param, v, s)
+
+        for i in range(train_iters):
+            (x, vt, st) = train_one_iter(random_grad_seq[i])
+        return x
+
+    oneflow_res = train_by_oneflow().numpy()
+    numpy_res = train_by_numpy()
+    test_case.assertTrue(
+        np.allclose(oneflow_res.flatten(), numpy_res.flatten(), rtol=0.001, atol=0.001)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAdam(flow.unittest.TestCase):
+    def test_adam(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["scale"] = [1.0, 0.8]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        arg_dict["betas"] = [(0.99, 0.9), (0.8, 0.7)]
+        arg_dict["weight_decay"] = [0.0, 0.1]
+        arg_dict["eps"] = [1e-08, 1e-07]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_adam(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_optim_adamw.py b/python/oneflow/test/modules/test_optim_adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fe5b4f26c77480c17b2c7220bea1ce6aa4764c7
--- /dev/null
+++ b/python/oneflow/test/modules/test_optim_adamw.py
@@ -0,0 +1,108 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.nn.parameter import Parameter
+
+
+def compare_with_numpy_adamw(
+    test_case, device, x_shape, scale, learning_rate, train_iters, weight_decay
+):
+    random_grad_seq = []
+    for _ in range(train_iters):
+        random_grad_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = np.random.uniform(size=x_shape).astype(np.float32)
+
+    def train_by_oneflow():
+        x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
+        adam = flow.optim.AdamW(
+            [
+                {
+                    "params": [x],
+                    "lr": learning_rate,
+                    "weight_decay": weight_decay,
+                    "scale": scale,
+                }
+            ]
+        )
+
+        def train_one_iter(grad):
+            grad_tensor = flow.Tensor(
+                grad, requires_grad=False, device=flow.device(device)
+            )
+            loss = flow.sum(x * grad_tensor)
+            loss.backward()
+            adam.step()
+            adam.zero_grad()
+
+        for i in range(train_iters):
+            train_one_iter(random_grad_seq[i])
+        return x
+
+    def train_by_numpy():
+        x = init_value
+        vt = np.zeros_like(x)
+        st = np.zeros_like(x)
+        beta1 = 0.9
+        beta2 = 0.999
+
+        def train_one_iter(grad):
+            grad = grad * scale
+            v = beta1 * vt + (1 - beta1) * grad
+            s = beta2 * st + (1 - beta2) * grad * grad
+            g = (
+                learning_rate / (np.sqrt(s) + 1e-08) * v
+                + learning_rate * weight_decay * x
+            )
+            param = x - g
+            return (param, v, s)
+
+        for i in range(train_iters):
+            (x, vt, st) = train_one_iter(random_grad_seq[i])
+        return x
+
+    oneflow_res = train_by_oneflow().numpy()
+    numpy_res = train_by_numpy()
+    test_case.assertTrue(
+        np.allclose(
+            oneflow_res.flatten(), numpy_res.flatten(), rtol=0.0001, atol=0.0001
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestAdamW(flow.unittest.TestCase):
+    def test_adamw(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["scale"] = [1.0, 0.9]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["train_iters"] = [10]
+        arg_dict["weight_decay"] = [0.001, 0.0]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_adamw(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_optim_rmsprop.py b/python/oneflow/test/modules/test_optim_rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..9afa38e9c59f54e6f71cc156abc0d1f1b667688e
--- /dev/null
+++ b/python/oneflow/test/modules/test_optim_rmsprop.py
@@ -0,0 +1,129 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.nn.parameter import Parameter
+
+
+def compare_with_numpy_rmsprop(
+    test_case,
+    device,
+    x_shape,
+    scale,
+    learning_rate,
+    momentum,
+    train_iters,
+    alpha,
+    eps,
+    weight_decay,
+    centered,
+):
+    random_grad_seq = []
+    for _ in range(train_iters):
+        random_grad_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = np.random.uniform(size=x_shape).astype(np.float32)
+
+    def train_by_oneflow():
+        x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
+        param_list = list()
+        param_list.append(x)
+        rmsprop = flow.optim.RMSprop(
+            [
+                {
+                    "params": param_list,
+                    "lr": learning_rate,
+                    "alpha": alpha,
+                    "eps": eps,
+                    "weight_decay": weight_decay,
+                    "momentum": momentum,
+                    "centered": centered,
+                    "scale": scale,
+                }
+            ]
+        )
+
+        def train_one_iter(grad):
+            grad_tensor = flow.Tensor(
+                grad, requires_grad=False, device=flow.device(device)
+            )
+            loss = flow.sum(x * grad_tensor)
+            loss.backward()
+            rmsprop.step()
+            rmsprop.zero_grad()
+
+        for i in range(train_iters):
+            train_one_iter(random_grad_seq[i])
+        return x
+
+    def train_by_numpy():
+        x = init_value
+        r = np.zeros_like(x)
+        v = np.zeros_like(x)
+        g = np.zeros_like(x)
+
+        def train_one_iter(grad):
+            grad = grad * scale
+            if centered:
+                r_ = alpha * r + (1 - alpha) * grad * grad
+                g_ = alpha * g + (1 - alpha) * grad
+                v_ = momentum * v + learning_rate / np.sqrt(r_ - g_ * g_ + eps) * grad
+            else:
+                r_ = alpha * r + (1 - alpha) * grad * grad
+                g_ = g
+                v_ = momentum * v + learning_rate / np.sqrt(r_ + eps) * grad
+            param = x - v_
+            return (param, r_, g_, v_)
+
+        for i in range(train_iters):
+            (x, r, g, v) = train_one_iter(random_grad_seq[i])
+        return x
+
+    oneflow_res = train_by_oneflow().numpy()
+    numpy_res = train_by_numpy()
+    test_case.assertTrue(
+        np.allclose(
+            oneflow_res.flatten(), numpy_res.flatten(), rtol=0.0001, atol=0.0001
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestRMSProp(flow.unittest.TestCase):
+    def test_rmsprop(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["scale"] = [1.0, 0.9]
+        arg_dict["learning_rate"] = [1]
+        arg_dict["momentum"] = [0.0]
+        arg_dict["train_iters"] = [10]
+        arg_dict["alpha"] = [0.9, 0.99]
+        arg_dict["eps"] = [1e-08, 1e-05]
+        arg_dict["weight_decay"] = [0.1, 0.99]
+        arg_dict["centered"] = [False, True]
+        for arg in GenArgList(arg_dict):
+            compare_with_numpy_rmsprop(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_optim_sgd.py b/python/oneflow/test/modules/test_optim_sgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..daabc37e1c4839a0492d9df9a36b93f0e04fccec
--- /dev/null
+++ b/python/oneflow/test/modules/test_optim_sgd.py
@@ -0,0 +1,109 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgDict
+
+import oneflow as flow
+import oneflow.unittest
+from oneflow.nn.parameter import Parameter
+
+
+def compare_with_numpy_sgd(
+    test_case,
+    device,
+    x_shape,
+    scale,
+    momentum,
+    weight_decay,
+    learning_rate,
+    train_iters,
+):
+    random_grad_seq = []
+    for _ in range(train_iters):
+        random_grad_seq.append(np.random.uniform(size=x_shape).astype(np.float32))
+    init_value = np.random.uniform(size=x_shape).astype(np.float32)
+
+    def train_by_oneflow():
+        x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
+        sgd = flow.optim.SGD(
+            [
+                {
+                    "params": [x],
+                    "lr": learning_rate,
+                    "momentum": momentum,
+                    "scale": scale,
+                    "weight_decay": weight_decay,
+                }
+            ]
+        )
+
+        def train_one_iter(grad):
+            grad_tensor = flow.Tensor(
+                grad, requires_grad=False, device=flow.device(device)
+            )
+            loss = flow.sum(x * grad_tensor)
+            loss.backward()
+            sgd.step()
+            sgd.zero_grad()
+
+        for i in range(train_iters):
+            train_one_iter(random_grad_seq[i])
+        return x
+
+    def train_by_numpy():
+        x = init_value
+        vt = np.zeros_like(x)
+
+        def train_one_iter(grad):
+            grad = grad * scale + weight_decay * x
+            v = momentum * vt - learning_rate * grad
+            param = x + v
+            return (param, v)
+
+        for i in range(train_iters):
+            (x, vt) = train_one_iter(random_grad_seq[i])
+        return x
+
+    oneflow_res = train_by_oneflow().numpy()
+    numpy_res = train_by_numpy()
+    test_case.assertTrue(
+        np.allclose(
+            oneflow_res.flatten(), numpy_res.flatten(), rtol=0.0001, atol=0.0001
+        )
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestOptimizers(flow.unittest.TestCase):
+    def test_sgd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["x_shape"] = [(10,)]
+        arg_dict["scale"] = [1.0, 0.9]
+        arg_dict["momentum"] = [0.0, 0.9]
+        arg_dict["weight_decay"] = [0.0, 0.9]
+        arg_dict["learning_rate"] = [1, 0.1]
+        arg_dict["train_iters"] = [10]
+        for arg in GenArgDict(arg_dict):
+            compare_with_numpy_sgd(test_case, **arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_permute.py b/python/oneflow/test/modules/test_permute.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5dd67a8039a02e03f0caa3a55ee7c4e0207e42f
--- /dev/null
+++ b/python/oneflow/test/modules/test_permute.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_permute_impl(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = input.permute(1, 0, 2, 3)
+    np_out = input.numpy().transpose((1, 0, 2, 3))
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.ones((2, 6, 5, 3))
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPermute(flow.unittest.TestCase):
+    def test_permute(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_permute_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_pixel_shuffle.py b/python/oneflow/test/modules/test_pixel_shuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..efe8ac5f764bd15042c40d3a7dd22f6e3968fde0
--- /dev/null
+++ b/python/oneflow/test/modules/test_pixel_shuffle.py
@@ -0,0 +1,101 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_pixel_shuffle(input, h_factor, w_factor):
+    (_batch, _channel, _height, _width) = input.shape
+    assert (
+        _channel % (h_factor * w_factor) == 0
+    ), "The channels of input tensor must be divisible by (h_upscale_factor * w_upscale_factor)"
+    _new_c = int(_channel / (h_factor * w_factor))
+    out = np.reshape(input, [_batch, _new_c, h_factor * w_factor, _height, _width])
+    out = np.reshape(out, [_batch, _new_c, h_factor, w_factor, _height, _width])
+    out = np.transpose(out, [0, 1, 4, 2, 5, 3])
+    out = np.reshape(out, [_batch, _new_c, _height * h_factor, _width * w_factor])
+    return out
+
+
+def _np_pixel_shuffle_grad(input, h_factor, w_factor):
+    (_batch, _new_channel, _height_mul_factor, _width_mul_factor) = input.shape
+    _channel = _new_channel * (h_factor * w_factor)
+    _height = _height_mul_factor // h_factor
+    _width = _width_mul_factor // w_factor
+    out = np.ones(shape=(_batch, _channel, _height, _width))
+    return out
+
+
+def _test_pixel_shuffle_impl(
+    test_case, device, shape, h_upscale_factor, w_upscale_factor
+):
+    x = np.random.randn(*shape)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    m = flow.nn.PixelShuffle(
+        h_upscale_factor=h_upscale_factor, w_upscale_factor=w_upscale_factor
+    )
+    m = m.to(device)
+    of_out = m(input)
+    np_out = _np_pixel_shuffle(x, h_upscale_factor, w_upscale_factor)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_pixel_shuffle_grad(np_out, h_upscale_factor, w_upscale_factor)
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPixelShuffleModule(flow.unittest.TestCase):
+    def test_pixel_shuffle(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_pixel_shuffle_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(2, 144, 5, 5), (11, 144, 1, 1)]
+        arg_dict["h_upscale_factor"] = [2, 3, 4]
+        arg_dict["w_upscale_factor"] = [2, 3, 4]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+        arg_dict["shape"] = [(8, 25, 18, 18), (1, 25, 2, 2)]
+        arg_dict["h_upscale_factor"] = [5]
+        arg_dict["w_upscale_factor"] = [5]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest()
+    def test_pixel_shuffle_with_random_data(test_case):
+        upscale_factor = random().to(int)
+        num_channels = upscale_factor * upscale_factor * random().to(int)
+        m = torch.nn.PixelShuffle(upscale_factor=upscale_factor)
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor(ndim=4, dim1=num_channels).to(device)
+        y = m(x)
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_pooling.py b/python/oneflow/test/modules/test_pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..587ce2d1ef7cba2061c2f845bdfea4223ee8d654
--- /dev/null
+++ b/python/oneflow/test/modules/test_pooling.py
@@ -0,0 +1,609 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import math
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _nd_tuple_to_dhw(nd_tuple, dim, prefix=1, dhw_offset=0):
+    assert dim <= 3
+    assert dim == len(nd_tuple) - dhw_offset
+    nd_tuple = list(nd_tuple)
+    dhw_tuple = nd_tuple[:dhw_offset]
+    dhw_tuple.extend([prefix for _ in range(3 - dim)])
+    dhw_tuple.extend(nd_tuple[dhw_offset:])
+    return tuple(dhw_tuple)
+
+
+def _dhw_tuple_to_nd(dhw_tuple, dim, prefix=1, dhw_offset=0):
+    assert dim <= 3
+    assert 3 == len(dhw_tuple) - dhw_offset
+    dhw_tuple = list(dhw_tuple)
+    nd_tuple = dhw_tuple[:dhw_offset]
+    nd_offset = dhw_offset + 3 - dim
+    for i in dhw_tuple[dhw_offset:nd_offset]:
+        assert prefix == i
+    nd_tuple.extend(dhw_tuple[nd_offset:])
+    return tuple(nd_tuple)
+
+
+class MaxPoolNumpy:
+    def __init__(self, dim=2, kernel_size=(2, 2), stride=(2, 2), padding=(0, 0)):
+        self.dim = dim
+        self.stride = _nd_tuple_to_dhw(stride, dim)
+        self.padding = _nd_tuple_to_dhw(padding, dim, prefix=0)
+        self.kernel_size = _nd_tuple_to_dhw(kernel_size, dim)
+        self.w_depth = self.kernel_size[0]
+        self.w_height = self.kernel_size[1]
+        self.w_width = self.kernel_size[2]
+        self.min_val = np.finfo(np.float64).min
+
+    def __call__(self, x):
+        self.x_shape = x.shape
+        x_shape_5d = _nd_tuple_to_dhw(self.x_shape, self.dim, prefix=1, dhw_offset=2)
+        x = x.reshape(x_shape_5d)
+        self.in_batch = np.shape(x)[0]
+        self.in_channel = np.shape(x)[1]
+        self.in_depth = np.shape(x)[2]
+        self.in_height = np.shape(x)[3]
+        self.in_width = np.shape(x)[4]
+        pad_x = np.pad(
+            x,
+            (
+                (0, 0),
+                (0, 0),
+                (self.padding[0], self.padding[0]),
+                (self.padding[1], self.padding[1]),
+                (self.padding[2], self.padding[2]),
+            ),
+            "constant",
+            constant_values=(self.min_val, self.min_val),
+        )
+        self.pad_x = pad_x
+        self.pad_shape = pad_x.shape
+        self.out_depth = int((self.in_depth - self.w_depth) / self.stride[0]) + 1
+        self.out_height = int((self.in_height - self.w_height) / self.stride[1]) + 1
+        self.out_width = int((self.in_width - self.w_width) / self.stride[2]) + 1
+        self.pad_out_depth = np.uint16(
+            math.ceil((self.pad_shape[2] - self.w_depth + 1) / self.stride[0])
+        )
+        self.pad_out_height = np.uint16(
+            math.ceil((self.pad_shape[3] - self.w_height + 1) / self.stride[1])
+        )
+        self.pad_out_width = np.uint16(
+            math.ceil((self.pad_shape[4] - self.w_width + 1) / self.stride[2])
+        )
+        out = np.zeros(
+            (
+                self.in_batch,
+                self.in_channel,
+                self.pad_out_depth,
+                self.pad_out_height,
+                self.pad_out_width,
+            )
+        )
+        self.arg_max = np.zeros_like(out, dtype=np.int32)
+        for n in range(self.in_batch):
+            for c in range(self.in_channel):
+                for i in range(self.pad_out_depth):
+                    for j in range(self.pad_out_height):
+                        for k in range(self.pad_out_width):
+                            start_i = i * self.stride[0]
+                            start_j = j * self.stride[1]
+                            start_k = k * self.stride[2]
+                            end_i = start_i + self.w_depth
+                            end_j = start_j + self.w_height
+                            end_k = start_k + self.w_width
+                            out[n, c, i, j, k] = np.max(
+                                pad_x[n, c, start_i:end_i, start_j:end_j, start_k:end_k]
+                            )
+                            self.arg_max[n, c, i, j, k] = np.argmax(
+                                pad_x[n, c, start_i:end_i, start_j:end_j, start_k:end_k]
+                            )
+        self.out_shape_5d = out.shape
+        out_shape = _dhw_tuple_to_nd(out.shape, self.dim, dhw_offset=2)
+        out = out.reshape(out_shape)
+        return out
+
+    def backward(self, d_loss):
+        d_loss = d_loss.reshape(self.out_shape_5d)
+        dx = np.zeros_like(self.pad_x)
+        for n in range(self.in_batch):
+            for c in range(self.in_channel):
+                for i in range(self.pad_out_depth):
+                    for j in range(self.pad_out_height):
+                        for k in range(self.pad_out_width):
+                            start_i = i * self.stride[0]
+                            start_j = j * self.stride[1]
+                            start_k = k * self.stride[2]
+                            end_i = start_i + self.w_depth
+                            end_j = start_j + self.w_height
+                            end_k = start_k + self.w_width
+                            index = np.unravel_index(
+                                self.arg_max[n, c, i, j, k], self.kernel_size
+                            )
+                            dx[n, c, start_i:end_i, start_j:end_j, start_k:end_k][
+                                index
+                            ] += d_loss[n, c, i, j, k]
+        dx = dx[
+            :,
+            :,
+            self.padding[0] : self.pad_shape[2] - self.padding[0],
+            self.padding[1] : self.pad_shape[3] - self.padding[1],
+            self.padding[2] : self.pad_shape[4] - self.padding[2],
+        ]
+        dx = dx.reshape(self.x_shape)
+        return dx
+
+
+def _test_maxpool1d_impl(test_case, device):
+    input_arr = np.array(
+        [
+            [
+                [-0.89042996, 2.33971243, -0.86660827, 0.80398747],
+                [-1.46769364, -0.78125064, 1.50086563, -0.76278226],
+                [1.31984534, 0.20741192, -0.86507054, -0.40776015],
+                [-0.89910823, 0.44932938, 1.49148118, -0.22036761],
+            ],
+            [
+                [-0.5452334, -0.10255169, -1.42035108, 0.73922913],
+                [-0.03192764, 0.69341935, 0.96263152, -1.52070843],
+                [0.02058239, 1.504032, 1.84423001, -0.0130596],
+                [2.20517719, 0.38449598, 0.85677771, 0.60425179],
+            ],
+            [
+                [-1.64366213, 0.51370298, -0.21754866, -0.05085382],
+                [1.17065374, 1.13857674, -1.13070507, 0.44353707],
+                [-1.30783846, -0.48031445, 0.41807536, -2.13778887],
+                [0.08259005, 0.5798125, 0.03024696, 1.96100924],
+            ],
+        ]
+    )
+    (kernel_size, stride, padding) = ((3,), (1,), (1,))
+    output = np.array(
+        [
+            [
+                [2.33971243, 2.33971243, 2.33971243, 0.80398747],
+                [-0.78125064, 1.50086563, 1.50086563, 1.50086563],
+                [1.31984534, 1.31984534, 0.20741192, -0.40776015],
+                [0.44932938, 1.49148118, 1.49148118, 1.49148118],
+            ],
+            [
+                [-0.10255169, -0.10255169, 0.73922913, 0.73922913],
+                [0.69341935, 0.96263152, 0.96263152, 0.96263152],
+                [1.504032, 1.84423001, 1.84423001, 1.84423001],
+                [2.20517719, 2.20517719, 0.85677771, 0.85677771],
+            ],
+            [
+                [0.51370298, 0.51370298, 0.51370298, -0.05085382],
+                [1.17065374, 1.17065374, 1.13857674, 0.44353707],
+                [-0.48031445, 0.41807536, 0.41807536, 0.41807536],
+                [0.5798125, 0.5798125, 1.96100924, 1.96100924],
+            ],
+        ]
+    )
+    output_indice = np.array(
+        [
+            [[1, 1, 1, 3], [1, 2, 2, 2], [0, 0, 1, 3], [1, 2, 2, 2]],
+            [[1, 1, 3, 3], [1, 2, 2, 2], [1, 2, 2, 2], [0, 0, 2, 2]],
+            [[1, 1, 1, 3], [0, 0, 1, 3], [1, 2, 2, 2], [1, 1, 3, 3]],
+        ]
+    )
+    grad = np.array(
+        [
+            [
+                [0.0, 3.0, 0.0, 1.0],
+                [0.0, 1.0, 3.0, 0.0],
+                [2.0, 1.0, 0.0, 1.0],
+                [0.0, 1.0, 3.0, 0.0],
+            ],
+            [
+                [0.0, 2.0, 0.0, 2.0],
+                [0.0, 1.0, 3.0, 0.0],
+                [0.0, 1.0, 3.0, 0.0],
+                [2.0, 0.0, 2.0, 0.0],
+            ],
+            [
+                [0.0, 3.0, 0.0, 1.0],
+                [2.0, 1.0, 0.0, 1.0],
+                [0.0, 1.0, 3.0, 0.0],
+                [0.0, 2.0, 0.0, 2.0],
+            ],
+        ]
+    )
+    m = flow.nn.MaxPool1d(
+        kernel_size=kernel_size, stride=stride, padding=padding, return_indices=True
+    )
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device), requires_grad=True)
+    (of_output, of_indice) = m(x)
+    y = of_output.sum()
+    y.backward()
+    test_case.assertTrue(np.allclose(x.grad.numpy(), grad, 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(of_indice.numpy(), output_indice, 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(of_output.numpy(), output, 0.0001, 0.0001))
+
+
+def _test_maxpool1d_zero_padding(test_case, device):
+    arr = np.arange(1000).reshape(4, 5, 50).astype(np.float)
+    input = flow.tensor(arr, dtype=flow.float32, device=flow.device(device))
+    m1 = flow.nn.MaxPool1d(kernel_size=3, stride=3, padding=0)
+    of_out = m1(input)
+    m2 = MaxPoolNumpy(2, kernel_size=(3, 1), stride=(3, 1), padding=(0, 0))
+    np_out = m2(arr.reshape(4, 5, 50, 1))
+    np_out = np.squeeze(np_out, axis=3)
+    test_case.assertTrue(np.allclose(np_out, of_out.numpy(), 0.0001, 0.0001))
+
+
+def _test_maxpool2d(test_case, device):
+    dim = 2
+    input_arr = np.random.randn(2, 3, 4, 5)
+    (kernel_size, stride, padding) = ((3, 3), (1, 1), (1, 1))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(
+        kernel_size=kernel_size, stride=stride, padding=padding, return_indices=True
+    )
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    (output, indice) = m(x)
+    test_case.assertTrue(indice.shape == x.shape)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+
+
+def _test_maxpool2d_ceil_mode(test_case, device):
+    dim = 2
+    input_arr = np.array(
+        [
+            [
+                [
+                    [-0.89042996, 2.33971243, -0.86660827, 0.80398747],
+                    [-1.46769364, -0.78125064, 1.50086563, -0.76278226],
+                    [1.31984534, 0.20741192, -0.86507054, -0.40776015],
+                    [-0.89910823, 0.44932938, 1.49148118, -0.22036761],
+                ],
+                [
+                    [-0.5452334, -0.10255169, -1.42035108, 0.73922913],
+                    [-0.03192764, 0.69341935, 0.96263152, -1.52070843],
+                    [0.02058239, 1.504032, 1.84423001, -0.0130596],
+                    [2.20517719, 0.38449598, 0.85677771, 0.60425179],
+                ],
+                [
+                    [-1.64366213, 0.51370298, -0.21754866, -0.05085382],
+                    [1.17065374, 1.13857674, -1.13070507, 0.44353707],
+                    [-1.30783846, -0.48031445, 0.41807536, -2.13778887],
+                    [0.08259005, 0.5798125, 0.03024696, 1.96100924],
+                ],
+            ],
+            [
+                [
+                    [0.45173843, -0.34680027, -0.99754943, 0.18539502],
+                    [-0.68451047, -0.03217399, 0.44705642, -0.39016231],
+                    [-0.18062337, 1.82099303, -0.19113869, 0.85298683],
+                    [0.14080452, 0.15306701, -1.02466827, -0.34480665],
+                ],
+                [
+                    [-0.21048489, 0.20933038, -0.09206508, -1.80402519],
+                    [-0.52028985, 0.01140166, -1.13452858, 0.96648332],
+                    [0.26454393, 0.48343972, -1.84055509, -0.01256443],
+                    [0.31024029, 0.11983007, 0.98806488, 0.93557438],
+                ],
+                [
+                    [0.39152445, 0.672159, 0.71289289, -0.68072016],
+                    [0.33711062, -1.78106242, 0.34545201, -1.62029359],
+                    [0.47343899, -2.3433269, -0.44517497, 0.09004267],
+                    [0.26310742, -1.53121271, 0.65028836, 1.3669488],
+                ],
+            ],
+        ]
+    )
+    ceil_mode_out = np.array(
+        [
+            [
+                [
+                    [2.33971243, 2.33971243, 0.80398747],
+                    [1.31984534, 1.50086563, -0.22036761],
+                    [0.44932938, 1.49148118, -0.22036761],
+                ],
+                [
+                    [0.69341935, 0.96263152, 0.73922913],
+                    [2.20517719, 1.84423001, 0.60425179],
+                    [2.20517719, 0.85677771, 0.60425179],
+                ],
+                [
+                    [1.17065374, 1.13857674, 0.44353707],
+                    [1.17065374, 1.96100924, 1.96100924],
+                    [0.5798125, 1.96100924, 1.96100924],
+                ],
+            ],
+            [
+                [
+                    [0.45173843, 0.44705642, 0.18539502],
+                    [1.82099303, 1.82099303, 0.85298683],
+                    [0.15306701, 0.15306701, -0.34480665],
+                ],
+                [
+                    [0.20933038, 0.96648332, 0.96648332],
+                    [0.48343972, 0.98806488, 0.96648332],
+                    [0.31024029, 0.98806488, 0.93557438],
+                ],
+                [
+                    [0.672159, 0.71289289, -0.68072016],
+                    [0.47343899, 1.3669488, 1.3669488],
+                    [0.26310742, 1.3669488, 1.3669488],
+                ],
+            ],
+        ]
+    )
+    (kernel_size, stride, padding) = ((3, 3), (2, 2), (1, 1))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m1 = flow.nn.MaxPool2d(
+        kernel_size=kernel_size, stride=stride, padding=padding, ceil_mode=False
+    )
+    m2 = flow.nn.MaxPool2d(
+        kernel_size=kernel_size, stride=stride, padding=padding, ceil_mode=True
+    )
+    m1.to(flow.device(device))
+    m2.to(flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    output1 = m1(x)
+    output2 = m2(x)
+    test_case.assertTrue(np.allclose(numpy_output, output1.numpy(), 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(ceil_mode_out, output2.numpy(), 0.0001, 0.0001))
+
+
+def _test_maxpool2d_special_kernel_size(test_case, device):
+    dim = 2
+    input_arr = np.random.randn(1, 1, 6, 6)
+    (kernel_size, stride, padding) = ((1, 1), (5, 5), (0, 0))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+
+
+def _test_maxpool2d_diff_kernel_stride(test_case, device):
+    dim = 2
+    input_arr = np.random.randn(9, 7, 32, 20)
+    (kernel_size, stride, padding) = ((2, 4), (4, 5), (1, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+
+
+def _test_maxpool2d_negative_input(test_case, device):
+    dim = 2
+    input_arr = -1.23456 * np.ones((1, 1, 1, 1), dtype=np.float32)
+    (kernel_size, stride, padding) = ((5, 5), (5, 5), (2, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+
+
+def _test_maxpool2d_backward(test_case, device):
+    dim = 2
+    input_arr = np.random.randn(6, 4, 7, 9)
+    (kernel_size, stride, padding) = ((4, 4), (1, 1), (1, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+def _test_maxpool2d_special_kernel_size_backward(test_case, device):
+    dim = 2
+    input_arr = np.random.randn(1, 1, 6, 6)
+    (kernel_size, stride, padding) = ((1, 1), (5, 5), (0, 0))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+def _test_maxpool2d_diff_kernel_stride_backward(test_case, device):
+    dim = 2
+    input_arr = np.random.randn(9, 7, 32, 20)
+    (kernel_size, stride, padding) = ((2, 4), (4, 5), (1, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+def _test_maxpool2d_negative_input_backward(test_case, device):
+    dim = 2
+    input_arr = -1.23456 * np.ones((1, 1, 1, 1), dtype=np.float32)
+    (kernel_size, stride, padding) = ((5, 5), (5, 5), (2, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+def _test_maxpool3d(test_case, device):
+    dim = 3
+    input_arr = np.random.randn(2, 3, 7, 9, 13)
+    (kernel_size, stride, padding) = ((2, 3, 4), (2, 3, 4), (1, 1, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+
+
+def _test_maxpool3d_backward(test_case, device):
+    dim = 3
+    input_arr = np.random.randn(6, 4, 8, 7, 9)
+    (kernel_size, stride, padding) = ((4, 4, 4), (1, 1, 1), (2, 1, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+def _test_maxpool3d_special_kernel_size_backward(test_case, device):
+    dim = 3
+    input_arr = np.random.randn(1, 1, 6, 6, 6)
+    (kernel_size, stride, padding) = ((1, 1, 1), (5, 5, 5), (0, 0, 0))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+def _test_maxpool3d_diff_kernel_stride_backward(test_case, device):
+    dim = 3
+    input_arr = np.random.randn(9, 7, 48, 32, 20)
+    (kernel_size, stride, padding) = ((6, 2, 4), (5, 4, 5), (3, 1, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool3d(kernel_size=kernel_size, stride=stride, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+def _test_maxpool3d_negative_input_backward(test_case, device):
+    dim = 3
+    input_arr = -1.23456 * np.ones((1, 1, 1, 1, 1), dtype=np.float32)
+    (kernel_size, stride, padding) = ((5, 5, 5), (5, 5, 5), (2, 2, 2))
+    m_numpy = MaxPoolNumpy(dim, kernel_size, stride, padding)
+    numpy_output = m_numpy(input_arr)
+    m = flow.nn.MaxPool3d(kernel_size=kernel_size, padding=padding)
+    m.to(flow.device(device))
+    x = flow.Tensor(input_arr, requires_grad=True, device=flow.device(device))
+    output = m(x)
+    test_case.assertTrue(np.allclose(numpy_output, output.numpy(), 0.0001, 0.0001))
+    output = output.sum()
+    output.backward()
+    doutput = np.ones_like(numpy_output, dtype=np.float64)
+    numpy_grad = m_numpy.backward(doutput)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), numpy_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPooling(flow.unittest.TestCase):
+    def test_maxpool1d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_maxpool1d_impl, _test_maxpool1d_zero_padding]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_maxpool2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_maxpool2d,
+            _test_maxpool2d_ceil_mode,
+            _test_maxpool2d_special_kernel_size,
+            _test_maxpool2d_diff_kernel_stride,
+            _test_maxpool2d_negative_input,
+            _test_maxpool2d_backward,
+            _test_maxpool2d_special_kernel_size_backward,
+            _test_maxpool2d_diff_kernel_stride_backward,
+            _test_maxpool2d_negative_input_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_maxpool3d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_maxpool3d,
+            _test_maxpool3d_backward,
+            _test_maxpool3d_special_kernel_size_backward,
+            _test_maxpool3d_negative_input_backward,
+            _test_maxpool3d_diff_kernel_stride_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_pow.py b/python/oneflow/test/modules/test_pow.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a87b26ffadb0391e74bc6c95a1ed35bb25b07ec
--- /dev/null
+++ b/python/oneflow/test/modules/test_pow.py
@@ -0,0 +1,114 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_pow_scalar_impl(test_case, shape, scalar, device):
+    np_input = 10 * np.random.rand(*shape)
+    of_input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.pow(of_input, scalar)
+    np_out = np.power(np_input, scalar)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_pow_elementwise_impl(test_case, shape, scalar, device):
+    np_input_x = 10 * np.random.rand(*shape)
+    np_input_y = np.random.randint(1, 3, shape) + np.random.randn(*shape)
+    of_input_x = flow.Tensor(np_input_x, dtype=flow.float32, device=flow.device(device))
+    of_input_y = flow.Tensor(np_input_y, dtype=flow.float32, device=flow.device(device))
+    of_out = flow.pow(of_input_x, of_input_y)
+    np_out = np.power(np_input_x, np_input_y)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_pow_backward_impl(test_case, device):
+    shape = (2, 3)
+    np_input_x = 10 * np.random.rand(*shape)
+    np_input_y = np.random.randint(1, 3, shape) + np.random.randn(*shape)
+    np_input_y_scalar = (np.random.randint(1, 3, (1,)) + np.random.randn(1))[0]
+    np_x_grad = np_input_y * np.power(np_input_x, np_input_y - 1)
+    np_y_grad = np.power(np_input_x, np_input_y) * np.log(np_input_x)
+    np_x_grad_scalar = np_input_y_scalar * np.power(np_input_x, np_input_y_scalar - 1)
+
+    def test_x_y_grad():
+        of_input_x = flow.Tensor(
+            np_input_x,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_input_y = flow.Tensor(
+            np_input_y,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.pow(of_input_x, of_input_y)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_x.grad.numpy(), np_x_grad, 0.0001, 0.0001)
+        )
+        test_case.assertTrue(
+            np.allclose(of_input_y.grad.numpy(), np_y_grad, 0.0001, 0.0001)
+        )
+
+    def test_x_grad_scalar():
+        of_input_x = flow.Tensor(
+            np_input_x,
+            dtype=flow.float32,
+            device=flow.device(device),
+            requires_grad=True,
+        )
+        of_out = flow.pow(of_input_x, np_input_y_scalar)
+        of_out_sum = of_out.sum()
+        of_out_sum.backward()
+        test_case.assertTrue(
+            np.allclose(of_input_x.grad.numpy(), np_x_grad_scalar, 0.0001, 0.0001)
+        )
+
+    test_x_y_grad()
+    test_x_grad_scalar()
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPow(flow.unittest.TestCase):
+    def test_pow_forward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4, 5)]
+        arg_dict["scalar"] = [2.1, 0.8]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_pow_scalar_impl(test_case, *arg)
+            _test_pow_elementwise_impl(test_case, *arg)
+
+    def test_pow_backward(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_pow_backward_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_prelu.py b/python/oneflow/test/modules/test_prelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cb4c04e932f7ce24bc488b3f16a07b5d96d16cc
--- /dev/null
+++ b/python/oneflow/test/modules/test_prelu.py
@@ -0,0 +1,104 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _prelu(input, alpha):
+    alpha = np.expand_dims(alpha, 0)
+    alpha = np.expand_dims(alpha, 2)
+    alpha = np.expand_dims(alpha, 3)
+    return np.where(input > 0, input, input * alpha)
+
+
+def _prelu_grad(input, alpha):
+    return alpha * (input <= 0) + (input > 0)
+
+
+def _test_prelu(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device))
+    np_alpha = np.random.randn(1)
+    prelu = flow.nn.PReLU(init=np_alpha)
+    if device == "cuda":
+        prelu.to(flow.device("cuda"))
+    np_out = _prelu(np_input, np_alpha)
+    of_out = prelu(input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_prelu_ndims(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device))
+    np_alpha = np.random.randn(shape[1])
+    prelu = flow.nn.PReLU(init=1.0, num_parameters=shape[1])
+    prelu_alpha = np.expand_dims(np_alpha, (1, 2))
+    prelu.weight = flow.nn.Parameter(flow.Tensor(prelu_alpha, dtype=flow.float32))
+    if device == "cuda":
+        prelu.to(flow.device("cuda"))
+    np_out = _prelu(np_input, np_alpha)
+    of_out = prelu(input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_prelu_grad(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    input = flow.Tensor(
+        np_input, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    np_alpha = 0.2
+    prelu = flow.nn.PReLU(init=np_alpha)
+    if device == "cuda":
+        prelu.to(flow.device("cuda"))
+    of_out = prelu(input).sum()
+    of_out.backward()
+    np_grad = _prelu_grad(np_input, np_alpha)
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestPReLU(flow.unittest.TestCase):
+    def test_prelu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_prelu(test_case, *arg)
+            _test_prelu_ndims(test_case, *arg)
+            _test_prelu_grad(test_case, *arg)
+
+    @unittest.skip("prelu has bug")
+    @autotest()
+    def test_prelu_module_with_random_data(test_case):
+        m = torch.nn.PReLU(num_parameters=random().to(int), init=random().to(float))
+        m.train(random())
+        device = random_device()
+        m.to(device)
+        x = random_pytorch_tensor().to(device)
+        y = m(x)
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_reciprocal.py b/python/oneflow/test/modules/test_reciprocal.py
new file mode 100644
index 0000000000000000000000000000000000000000..003257bc4efb1c24efb821aa94aeea3e09ebce17
--- /dev/null
+++ b/python/oneflow/test/modules/test_reciprocal.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_reciprocal_impl(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.reciprocal(x)
+    np_out = np.reciprocal(x.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReciprocalModule(flow.unittest.TestCase):
+    def test_reciprocal(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_reciprocal_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_reduce_ops.py b/python/oneflow/test/modules/test_reduce_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c37ddeec3a673050aa6f4c5f88897f624b2c502a
--- /dev/null
+++ b/python/oneflow/test/modules/test_reduce_ops.py
@@ -0,0 +1,149 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_min(test_case, device, shape, dim, keepdims):
+    input_arr = np.random.randn(*shape)
+    np_out = np.amin(input_arr, axis=dim, keepdims=keepdims)
+    x = flow.Tensor(
+        input_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.min(x, dim, keepdims)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = np.zeros_like(input_arr)
+    if dim == None:
+        arg_min = np.argmin(input_arr)
+        np.put(np_out_grad, arg_min, 1)
+    else:
+        arg_min = np.expand_dims(np.argmin(input_arr, axis=dim), axis=dim)
+        np.put_along_axis(np_out_grad, arg_min, 1, axis=dim)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_out_grad, 0.0001, 0.0001))
+
+
+def _test_min_tensor_function(test_case, device, shape, dim, keepdims):
+    input_arr = np.random.randn(*shape)
+    np_out = np.amin(input_arr, axis=dim, keepdims=keepdims)
+    x = flow.Tensor(
+        input_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = x.min(dim, keepdims)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = np.zeros_like(input_arr)
+    if dim == None:
+        arg_min = np.argmin(input_arr)
+        np.put(np_out_grad, arg_min, 1)
+    else:
+        arg_min = np.expand_dims(np.argmin(input_arr, axis=dim), axis=dim)
+        np.put_along_axis(np_out_grad, arg_min, 1, axis=dim)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_out_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMinModule(flow.unittest.TestCase):
+    def test_min(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_min, _test_min_tensor_function]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4, 5)]
+        arg_dict["dim"] = [None, 0, -1]
+        arg_dict["keepdims"] = [False, True]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_min_against_pytorch(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_type"] = [test_flow_against_pytorch, test_tensor_against_pytorch]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, "min", device=arg[1])
+
+
+def _test_max(test_case, device, shape, dim, keepdims):
+    input_arr = np.random.randn(*shape)
+    np_out = np.amax(input_arr, axis=dim, keepdims=keepdims)
+    x = flow.Tensor(
+        input_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.max(x, dim, keepdims)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = np.zeros_like(input_arr)
+    if dim == None:
+        arg_max = np.argmax(input_arr)
+        np.put(np_out_grad, arg_max, 1)
+    else:
+        arg_max = np.expand_dims(np.argmax(input_arr, axis=dim), axis=dim)
+        np.put_along_axis(np_out_grad, arg_max, 1, axis=dim)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_out_grad, 0.0001, 0.0001))
+
+
+def _test_max_tensor_function(test_case, device, shape, dim, keepdims):
+    input_arr = np.random.randn(*shape)
+    np_out = np.amax(input_arr, axis=dim, keepdims=keepdims)
+    x = flow.Tensor(
+        input_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = x.max(dim, keepdims)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = np.zeros_like(input_arr)
+    if dim == None:
+        arg_max = np.argmax(input_arr)
+        np.put(np_out_grad, arg_max, 1)
+    else:
+        arg_max = np.expand_dims(np.argmax(input_arr, axis=dim), axis=dim)
+        np.put_along_axis(np_out_grad, arg_max, 1, axis=dim)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_out_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestMaxModule(flow.unittest.TestCase):
+    def test_max(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_max, _test_max_tensor_function]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4, 5)]
+        arg_dict["dim"] = [None, 0, -1]
+        arg_dict["keepdims"] = [False, True]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    def test_max_against_pytorch(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_type"] = [test_flow_against_pytorch, test_tensor_against_pytorch]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, "max", device=arg[1])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_reflection_pad2d.py b/python/oneflow/test/modules/test_reflection_pad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad7e2e9abaeb4617fde55e956f07d94cf5d3af4c
--- /dev/null
+++ b/python/oneflow/test/modules/test_reflection_pad2d.py
@@ -0,0 +1,112 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import Array2Numpy, FlattenArray, GenArgList, Index2Coordinate
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def gen_numpy_test_sample(input, padding):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    pad_shape = ((0, 0), (0, 0), (pad_top, pad_bottom), (pad_left, pad_right))
+
+    def _np_reflection_pad2d(input, pad_shape):
+        numpy_reflect = np.pad(input, pad_shape, "reflect")
+        return numpy_reflect
+
+    def _np_reflection_pad2d_grad(src, dest):
+        (dx_height, dx_width) = (input.shape[h_idx], input.shape[w_idx])
+        (dy_height, dy_width) = (output.shape[h_idx], output.shape[w_idx])
+        numpy_src = np.ones(src.shape, np.int32)
+        numpy_dest = np.zeros(dest.shape, np.int32)
+        array_src = FlattenArray(numpy_src)
+        array_dest = FlattenArray(numpy_dest)
+        src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+        dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+        elements_num = src.shape[0] * src_num
+        for iter_n in range(elements_num):
+            coords = Index2Coordinate(iter_n, src.shape)
+            (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+            ip_x = ip_y = 0
+            if j < pad_left:
+                ip_x = pad_left * 2 - j
+            elif j >= pad_left and j < dx_width + pad_left:
+                ip_x = j
+            else:
+                ip_x = (dx_width + pad_left - 1) * 2 - j
+            if i < pad_top:
+                ip_y = pad_top * 2 - i
+            elif i >= pad_top and i < dx_height + pad_top:
+                ip_y = i
+            else:
+                ip_y = (dx_height + pad_top - 1) * 2 - i
+            ip_x = ip_x - pad_left
+            ip_y = ip_y - pad_top
+            src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+            dest_index = (
+                n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+            )
+            array_dest[dest_index] += array_src[src_index]
+        numpy_dest = Array2Numpy(array_dest, dest.shape)
+        return numpy_dest
+
+    output = _np_reflection_pad2d(input, pad_shape)
+    grad = _np_reflection_pad2d_grad(output, input)
+    return (output, grad)
+
+
+def _test_reflection_pad2d(test_case, shape, padding, device):
+    np_input = np.random.randn(*shape).astype(np.float32)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    if isinstance(padding, int):
+        boundary = [padding, padding, padding, padding]
+    elif isinstance(padding, tuple) and len(padding) == 4:
+        boundary = [padding[0], padding[1], padding[2], padding[3]]
+    else:
+        raise ValueError("padding must be in or list or tuple!")
+    (np_out, np_grad) = gen_numpy_test_sample(np_input, boundary)
+    layer = flow.nn.ReflectionPad2d(padding=padding)
+    of_out = layer(of_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReflectionPad2dModule(flow.unittest.TestCase):
+    def test_reflection_pad2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(1, 2, 3, 4), (8, 3, 4, 4)]
+        arg_dict["padding"] = [2, (1, 1, 2, 2)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_reflection_pad2d(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_repeat.py b/python/oneflow/test/modules/test_repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..369a9026251b47c5684602c08c9ed7abff40efde
--- /dev/null
+++ b/python/oneflow/test/modules/test_repeat.py
@@ -0,0 +1,151 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def np_repeat(x, sizes):
+    return np.tile(x, sizes)
+
+
+def _test_repeat_new_dim(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    sizes = (4, 3, 2, 3, 3)
+    np_out = np_repeat(input.numpy(), sizes)
+    of_out = input.repeat(sizes=sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_repeat_same_dim(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    sizes = (4, 2, 3, 19)
+    of_out = input.repeat(sizes=sizes)
+    np_out = np_repeat(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_repeat_same_dim_int(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3), dtype=flow.int32, device=flow.device(device)
+    )
+    size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19))
+    sizes = size_tensor.size()
+    of_out = input.repeat(sizes=sizes)
+    np_out = np_repeat(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32)))
+
+
+def _test_repeat_same_dim_int8(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3), dtype=flow.int8, device=flow.device(device)
+    )
+    size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19))
+    sizes = size_tensor.size()
+    of_out = input.repeat(sizes=sizes)
+    np_out = np_repeat(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32)))
+
+
+def _test_repeat_new_dim_backward(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 4, 1, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    sizes = (4, 3, 2, 3, 3)
+    of_out = input.repeat(sizes=sizes)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            [[216.0, 216.0, 216.0]],
+            [[216.0, 216.0, 216.0]],
+            [[216.0, 216.0, 216.0]],
+            [[216.0, 216.0, 216.0]],
+        ],
+        [
+            [[216.0, 216.0, 216.0]],
+            [[216.0, 216.0, 216.0]],
+            [[216.0, 216.0, 216.0]],
+            [[216.0, 216.0, 216.0]],
+        ],
+    ]
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+def _test_repeat_same_dim_backward(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    sizes = (1, 2, 3, 1)
+    of_out = input.repeat(sizes=sizes)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [
+        [
+            [
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+            ],
+            [
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+                [6.0, 6.0, 6.0],
+            ],
+        ]
+    ]
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestRepeat(flow.unittest.TestCase):
+    def test_repeat(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_repeat_new_dim,
+            _test_repeat_same_dim,
+            _test_repeat_same_dim_int,
+            _test_repeat_same_dim_int8,
+            _test_repeat_new_dim_backward,
+            _test_repeat_same_dim_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_replicationpad2d.py b/python/oneflow/test/modules/test_replicationpad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b80fd8fcd810b5f52b382235d5ec000ecff22731
--- /dev/null
+++ b/python/oneflow/test/modules/test_replicationpad2d.py
@@ -0,0 +1,107 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import Array2Numpy, FlattenArray, GenArgList, Index2Coordinate
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_replication_pad2d_grad(src, dest, padding):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    (dx_height, dx_width) = (dest.shape[h_idx], dest.shape[w_idx])
+    (dy_height, dy_width) = (src.shape[h_idx], src.shape[w_idx])
+    numpy_src = np.ones(src.shape, np.int32)
+    numpy_dest = np.zeros(dest.shape, np.int32)
+    array_src = FlattenArray(numpy_src)
+    array_dest = FlattenArray(numpy_dest)
+    src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+    dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+    elements_num = src.shape[0] * src_num
+    for iter_n in range(elements_num):
+        coords = Index2Coordinate(iter_n, src.shape)
+        (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+        ip_x = ip_y = 0
+        if j < pad_left:
+            ip_x = pad_left
+        elif j >= pad_left and j < dx_width + pad_left:
+            ip_x = j
+        else:
+            ip_x = dx_width + pad_left - 1
+        if i < pad_top:
+            ip_y = pad_top
+        elif i >= pad_top and i < dx_height + pad_top:
+            ip_y = i
+        else:
+            ip_y = dx_height + pad_top - 1
+        ip_x = ip_x - pad_left
+        ip_y = ip_y - pad_top
+        src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+        dest_index = n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+        array_dest[dest_index] += array_src[src_index]
+    numpy_dest = Array2Numpy(array_dest, dest.shape)
+    return numpy_dest
+
+
+def _test_ReplicationPad2d(test_case, shape, padding, device):
+    np_input = np.random.random(shape).astype(np.float32)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    if isinstance(padding, int):
+        np_boundary = ((0, 0), (0, 0), (padding, padding), (padding, padding))
+        boundry = [padding, padding, padding, padding]
+    elif isinstance(padding, (tuple, int)) and len(padding) == 4:
+        np_boundary = (
+            (0, 0),
+            (0, 0),
+            (padding[2], padding[3]),
+            (padding[0], padding[1]),
+        )
+        boundry = [padding[0], padding[1], padding[2], padding[3]]
+    else:
+        raise ValueError("padding must be in or list or tuple!")
+    layer = flow.nn.ReplicationPad2d(padding=padding)
+    of_out = layer(of_input)
+    np_out = np.pad(np_input, np_boundary, mode="edge")
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = _np_replication_pad2d_grad(np_out, np_input, boundry)
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_out_grad, 0.001, 0.001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestReplicationPad2dModule(flow.unittest.TestCase):
+    def test_ReplicationPad2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(1, 2, 3, 4), (8, 3, 4, 4)]
+        arg_dict["padding"] = [2, (1, 1, 2, 2)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_ReplicationPad2d(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_reshape.py b/python/oneflow/test/modules/test_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf5cf34dab293775c23d85ba7b645007a949b786
--- /dev/null
+++ b/python/oneflow/test/modules/test_reshape.py
@@ -0,0 +1,88 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_reshape(test_case, device):
+    x = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, device=flow.device(device))
+    of_shape = flow.reshape(input, shape=[2, 2, 2, -1]).numpy().shape
+    np_shape = (2, 2, 2, 2)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+
+
+def _test_reshape_tuple(test_case, device):
+    x = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, device=flow.device(device))
+    of_shape = flow.reshape(input, shape=(2, 2, 2, -1)).numpy().shape
+    np_shape = (2, 2, 2, 2)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+
+
+def _test_reshape_backward(test_case, device):
+    x = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, device=flow.device(device), requires_grad=True)
+    of_out = flow.reshape(input, shape=[2, 2, 2, -1]).sum()
+    of_out.backward()
+    np_grad = np.array(
+        [
+            [1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0],
+        ]
+    )
+    test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_reshape(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_reshape,
+            _test_reshape_tuple,
+            _test_reshape_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest()
+    def test_reshape_flow_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor(ndim=4).to(device)
+        y = torch.reshape(x, shape=(-1,))
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_resnet50_with_bn.py b/python/oneflow/test/modules/test_resnet50_with_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..328adec8e7d289f5517590c9ba6baf5396ae86a5
--- /dev/null
+++ b/python/oneflow/test/modules/test_resnet50_with_bn.py
@@ -0,0 +1,97 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+from resnet50_model import resnet50
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+@flow.unittest.skip_unless_1n1d()
+class TestResNet50(flow.unittest.TestCase):
+    def test_resnet50_with_batchnorm(test_case):
+        batch_size = 32
+        color_space = "RGB"
+        height = 224
+        width = 224
+        output_layout = "NCHW"
+        rgb_mean = [123.68, 116.779, 103.939]
+        rgb_std = [58.393, 57.12, 57.375]
+        record_reader = flow.nn.OfrecordReader(
+            "/dataset/imagenette/ofrecord",
+            batch_size=batch_size,
+            data_part_num=1,
+            part_name_suffix_length=5,
+            shuffle_after_epoch=False,
+        )
+        record_image_decoder = flow.nn.OFRecordImageDecoder(
+            "encoded", color_space=color_space
+        )
+        record_label_decoder = flow.nn.OfrecordRawDecoder(
+            "class/label", shape=(), dtype=flow.int32
+        )
+        resize = flow.nn.image.Resize(
+            resize_side="shorter", keep_aspect_ratio=True, target_size=256
+        )
+        crop_mirror_normal = flow.nn.CropMirrorNormalize(
+            color_space=color_space,
+            output_layout=output_layout,
+            crop_h=height,
+            crop_w=width,
+            crop_pos_y=0.5,
+            crop_pos_x=0.5,
+            mean=rgb_mean,
+            std=rgb_std,
+            output_dtype=flow.float,
+        )
+        res50_module = resnet50(
+            replace_stride_with_dilation=[False, False, False],
+            norm_layer=flow.nn.BatchNorm2d,
+        )
+        res50_module.train()
+        res50_module.load_state_dict(flow.load("/dataset/imagenette/resnet50_models"))
+        of_corss_entropy = flow.nn.CrossEntropyLoss()
+        res50_module.to("cuda")
+        of_corss_entropy.to("cuda")
+        learning_rate = 0.001
+        mom = 0.9
+        of_sgd = flow.optim.SGD(
+            res50_module.parameters(), lr=learning_rate, momentum=mom
+        )
+        errors = 0.0
+        for b in range(100):
+            val_record = record_reader()
+            label = record_label_decoder(val_record)
+            image_raw_buffer = record_image_decoder(val_record)
+            image = resize(image_raw_buffer)[0]
+            image = crop_mirror_normal(image)
+            image = image.to("cuda")
+            label = label.to("cuda")
+            logits = res50_module(image)
+            loss = of_corss_entropy(logits, label)
+            loss.backward()
+            of_sgd.step()
+            of_sgd.zero_grad()
+            l = loss.numpy()[0]
+        test_case.assertTrue(l < 3.5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_resnet50_without_bn.py b/python/oneflow/test/modules/test_resnet50_without_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aab85267febd565919a997c0b88e66a94365079
--- /dev/null
+++ b/python/oneflow/test/modules/test_resnet50_without_bn.py
@@ -0,0 +1,200 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import unittest
+
+import numpy as np
+from resnet50_model import FakeBN, resnet50
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+@flow.unittest.skip_unless_1n1d()
+class TestResNet50(flow.unittest.TestCase):
+    def test_resnet50_without_batchnorm(test_case):
+        batch_size = 32
+        color_space = "RGB"
+        height = 224
+        width = 224
+        output_layout = "NCHW"
+        rgb_mean = [123.68, 116.779, 103.939]
+        rgb_std = [58.393, 57.12, 57.375]
+        record_reader = flow.nn.OfrecordReader(
+            "/dataset/imagenette/ofrecord",
+            batch_size=batch_size,
+            data_part_num=1,
+            part_name_suffix_length=5,
+            shuffle_after_epoch=False,
+        )
+        record_image_decoder = flow.nn.OFRecordImageDecoder(
+            "encoded", color_space=color_space
+        )
+        record_label_decoder = flow.nn.OfrecordRawDecoder(
+            "class/label", shape=(), dtype=flow.int32
+        )
+        resize = flow.nn.image.Resize(
+            resize_side="shorter", keep_aspect_ratio=True, target_size=256
+        )
+        crop_mirror_normal = flow.nn.CropMirrorNormalize(
+            color_space=color_space,
+            output_layout=output_layout,
+            crop_h=height,
+            crop_w=width,
+            crop_pos_y=0.5,
+            crop_pos_x=0.5,
+            mean=rgb_mean,
+            std=rgb_std,
+            output_dtype=flow.float,
+        )
+        res50_module = resnet50(
+            replace_stride_with_dilation=[False, False, False], norm_layer=FakeBN
+        )
+        res50_module.train()
+        res50_module.load_state_dict(
+            flow.load("/dataset/imagenette/resnet50_pretrained")
+        )
+        of_corss_entropy = flow.nn.CrossEntropyLoss()
+        res50_module.to("cuda")
+        of_corss_entropy.to("cuda")
+        learning_rate = 0.001
+        mom = 0.9
+        of_sgd = flow.optim.SGD(
+            res50_module.parameters(), lr=learning_rate, momentum=mom
+        )
+        gt_of_losses = [
+            6.823004722595215,
+            6.818080902099609,
+            6.817478179931641,
+            6.820215702056885,
+            6.820272445678711,
+            6.805415630340576,
+            6.812217712402344,
+            6.822971343994141,
+            6.81321907043457,
+            6.812097549438477,
+            6.808729648590088,
+            6.809578895568848,
+            6.810042381286621,
+            6.81298303604126,
+            6.806015968322754,
+            6.809454917907715,
+            6.808111190795898,
+            6.80530309677124,
+            6.808160781860352,
+            6.809715747833252,
+            6.804327487945557,
+            6.801260948181152,
+            6.801140785217285,
+            6.802030086517334,
+            6.802935600280762,
+            6.793076992034912,
+            6.800511360168457,
+            6.7988386154174805,
+            6.798485279083252,
+            6.802251815795898,
+            6.798983573913574,
+            6.798493385314941,
+            6.796577453613281,
+            6.787880897521973,
+            6.796964645385742,
+            6.783697128295898,
+            6.7896833419799805,
+            6.786165714263916,
+            6.790346145629883,
+            6.785680770874023,
+            6.782796859741211,
+            6.784112930297852,
+            6.792185306549072,
+            6.780761241912842,
+            6.778015613555908,
+            6.778000354766846,
+            6.789952278137207,
+            6.773430824279785,
+            6.780228614807129,
+            6.774554252624512,
+            6.77685546875,
+            6.7801337242126465,
+            6.767944812774658,
+            6.7757134437561035,
+            6.772693157196045,
+            6.770571231842041,
+            6.766884803771973,
+            6.762784004211426,
+            6.765412330627441,
+            6.768856048583984,
+            6.769237518310547,
+            6.77099609375,
+            6.765361785888672,
+            6.7630228996276855,
+            6.757351875305176,
+            6.761430740356445,
+            6.757913112640381,
+            6.756040096282959,
+            6.75714111328125,
+            6.752540588378906,
+            6.7559967041015625,
+            6.759932041168213,
+            6.756745338439941,
+            6.750467300415039,
+            6.750478744506836,
+            6.750133514404297,
+            6.75436544418335,
+            6.744396209716797,
+            6.753242492675781,
+            6.747480392456055,
+            6.744192123413086,
+            6.744802474975586,
+            6.742746829986572,
+            6.7499589920043945,
+            6.739953517913818,
+            6.739869117736816,
+            6.744085311889648,
+            6.744339942932129,
+            6.741791248321533,
+            6.737485885620117,
+            6.735355377197266,
+            6.7377848625183105,
+            6.73032283782959,
+            6.734944820404053,
+            6.7288079261779785,
+            6.737483978271484,
+            6.730724334716797,
+            6.728422164916992,
+            6.723917007446289,
+            6.734870910644531,
+        ]
+        for b in range(100):
+            val_record = record_reader()
+            label = record_label_decoder(val_record)
+            image_raw_buffer = record_image_decoder(val_record)
+            image = resize(image_raw_buffer)[0]
+            image = crop_mirror_normal(image)
+            image = image.to("cuda")
+            label = label.to("cuda")
+            logits = res50_module(image)
+            loss = of_corss_entropy(logits, label)
+            loss.backward()
+            of_sgd.step()
+            of_sgd.zero_grad()
+            l = loss.numpy()[0]
+            test_case.assertTrue(np.allclose(l.item(), gt_of_losses[b], atol=1e-05))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_round.py b/python/oneflow/test/modules/test_round.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff9e1cf4488261663c2a96905c338a74935a0fd3
--- /dev/null
+++ b/python/oneflow/test/modules/test_round.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_round_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.round(of_input)
+    np_out = np.round(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np.zeros(shape), 0.0001, 0.0001)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestRound(flow.unittest.TestCase):
+    def test_round(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_round_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_scatter_nd.py b/python/oneflow/test/modules/test_scatter_nd.py
new file mode 100644
index 0000000000000000000000000000000000000000..5636166ff340ea172ec9e2f195b51ac0ff2e2c86
--- /dev/null
+++ b/python/oneflow/test/modules/test_scatter_nd.py
@@ -0,0 +1,95 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_scatter_nd(test_case, device):
+    indices = flow.Tensor(
+        np.array([[1], [6], [4]]), dtype=flow.int, device=flow.device(device)
+    )
+    update = flow.Tensor(
+        np.array([10.2, 5.1, 12.7]), dtype=flow.float, device=flow.device(device)
+    )
+    np_out = np.array([0.0, 10.2, 0.0, 0.0, 12.7, 0.0, 5.1, 0.0])
+    output = flow.scatter_nd(indices, update, [8])
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_scatter_nd_t(test_case, device):
+    indices = flow.Tensor(
+        np.array([[0], [4], [2]]), dtype=flow.int, device=flow.device(device)
+    )
+    update = flow.Tensor(
+        np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]]),
+        dtype=flow.float,
+        device=flow.device(device),
+    )
+    np_out = np.array(
+        [
+            [1.0, 1.0, 1.0],
+            [0.0, 0.0, 0.0],
+            [3.0, 3.0, 3.0],
+            [0.0, 0.0, 0.0],
+            [2.0, 2.0, 2.0],
+        ]
+    )
+    output = flow.scatter_nd(indices, update, [5, 3])
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_scatter_nd_backward(test_case, device):
+    indices = flow.Tensor(
+        np.array([[1], [6], [4]]), dtype=flow.int, device=flow.device(device)
+    )
+    of_update = flow.Tensor(
+        np.array([10.2, 5.1, 12.7]),
+        requires_grad=True,
+        dtype=flow.float,
+        device=flow.device(device),
+    )
+    np_out = np.array([0.0, 10.2, 0.0, 0.0, 12.7, 0.0, 5.1, 0.0])
+    np_grad = np.array([1.0, 1.0, 1.0])
+    output = flow.scatter_nd(indices, of_update, [8])
+    out_sum = output.sum()
+    out_sum.backward()
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 0.0001, 0.0001))
+    test_case.assertTrue(np.array_equal(of_update.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestScatter_nd(flow.unittest.TestCase):
+    def test_scatter_nd(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_scatter_nd,
+            _test_scatter_nd_t,
+            _test_scatter_nd_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_sign.py b/python/oneflow/test/modules/test_sign.py
new file mode 100644
index 0000000000000000000000000000000000000000..f46a6165b406f6ddbc8bf8c22c12ac87ad7aed05
--- /dev/null
+++ b/python/oneflow/test/modules/test_sign.py
@@ -0,0 +1,52 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_sign_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.sign(of_input)
+    np_out = np.sign(np_input)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.zeros_like(np_input)
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSign(flow.unittest.TestCase):
+    def test_sign(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_sign_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_slice.py b/python/oneflow/test/modules/test_slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..7409be5af7beecd461158462582b49ff9e7f4d4d
--- /dev/null
+++ b/python/oneflow/test/modules/test_slice.py
@@ -0,0 +1,178 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_slice(test_case, device):
+    np_arr = np.random.randn(3, 6, 9).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    tup_list = [[None, None, None], [0, 5, 2], [0, 6, 3]]
+    y = flow.slice(x, slice_tup_list=tup_list)
+    tmp = np_arr[0:3, 0:5, 0:6]
+    np_out = tmp[::1, ::2, ::3]
+    test_case.assertTrue(np.array_equal(y.numpy(), np_out))
+
+
+def _test_slice_1_dim(test_case, device):
+    np_arr = np.random.randn(100).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    test_case.assertTrue(np.allclose(x[1].numpy(), np_arr[1], 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(x[99].numpy(), np_arr[99], 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(x[0:2].numpy(), np_arr[0:2], 1e-05, 1e-05))
+
+
+def _test_slice_3_dim(test_case, device):
+    np_arr = np.random.randn(2, 3, 4).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    test_case.assertTrue(np.allclose(x[:, 0].numpy(), np_arr[:, 0], 1e-05, 1e-05))
+
+
+def _test_slice_4_dim(test_case, device):
+    np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    tup_list = [[0, 5, 2], [None, None, None], [0, 5, 2], [0, 6, 3]]
+    y = flow.slice(x, slice_tup_list=tup_list)
+    tmp = np_arr[0:5, 0:3, 0:5, 0:6]
+    np_out = tmp[::2, ::1, ::2, ::3]
+    test_case.assertTrue(np.array_equal(y.numpy(), np_out))
+
+
+def _test_slice_with_int_index(test_case, device):
+    np_arr = np.random.randn(2, 3, 4).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = x[0, 1:2]
+    np_out = np_arr[0, 1:2]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    np_arr = np.random.randn(2, 3, 4).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = x[0, :]
+    np_out = np_arr[0, :]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    np_arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = x[0, :, :]
+    np_out = np_arr[0, :, :]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    np_arr = np.random.randn(2, 3, 4, 5).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = x[0, :, :, :]
+    np_out = np_arr[0, :, :, :]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_slice_negative_index(test_case, device):
+    np_arr = np.random.randn(4, 5, 6)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    test_case.assertTrue(np.allclose(x[-1].numpy(), np_arr[-1], 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(x[-2].numpy(), np_arr[-2], 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(x[-3].numpy(), np_arr[-3], 0.0001, 0.0001))
+    test_case.assertTrue(np.allclose(x[-4].numpy(), np_arr[-4], 0.0001, 0.0001))
+
+
+def _test_slice_ellipsis_type(test_case, device):
+    np_arr = np.random.randn(2, 3, 4, 5, 6, 7).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    of_out = x[..., ::2, ::2, 3:4]
+    np_out = np_arr[..., ::2, ::2, 3:4]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    of_out = x[..., 1:2, ::2, 1, ::3]
+    np_out = np_arr[..., 1:2, ::2, 1, ::3]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    of_out = x[0, 2, ..., 1, 1:2]
+    np_out = np_arr[0, 2, ..., 1, 1:2]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+    of_out = x[::2, ..., 1:2]
+    np_out = np_arr[::2, ..., 1:2]
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_slice_backward(test_case, device):
+    np_arr = np.random.randn(3, 6, 9).astype(np.float32)
+    x = flow.Tensor(np_arr, device=flow.device(device), requires_grad=True)
+    tup_list = [[None, None, None], [0, 5, 2], [0, 6, 3]]
+    y = flow.slice(x, slice_tup_list=tup_list)
+    z = y.sum()
+    z.backward()
+    np_grad = np.zeros((3, 6, 9))
+    np_grad[0:3, 0:5, 0:6][::1, ::2, ::3] = 1
+    test_case.assertTrue(np.array_equal(x.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSlice(flow.unittest.TestCase):
+    def test_slice(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_slice,
+            _test_slice_1_dim,
+            _test_slice_3_dim,
+            _test_slice_4_dim,
+            _test_slice_with_int_index,
+            _test_slice_negative_index,
+            _test_slice_ellipsis_type,
+            _test_slice_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSliceUpdate(flow.unittest.TestCase):
+    def test_slice_update(test_case):
+        x = np.array([1, 1, 1, 1, 1]).astype(np.float32)
+        input = flow.Tensor(x)
+        update = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        output = np.array([1.0, 2.0, 3.0, 4.0, 1.0])
+        y = flow.slice_update(input, update, slice_tup_list=[[1, 4, 1]])
+        test_case.assertTrue(np.array_equal(y.numpy(), output))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestLogicalSliceAssign(flow.unittest.TestCase):
+    def test_logical_slice_assign(test_case):
+        x = np.array([1, 1, 1, 1, 1]).astype(np.float32)
+        input = flow.Tensor(x)
+        update = flow.Tensor(np.array([2, 3, 4]).astype(np.float32))
+        output = np.array([1.0, 2.0, 3.0, 4.0, 1.0])
+        flow.tmp.logical_slice_assign(input, update, slice_tup_list=[[1, 4, 1]])
+        test_case.assertTrue(np.array_equal(input.numpy(), output))
+
+    def test_logical_slice_assign_negative_index(test_case):
+        np_arr = np.zeros(shape=(2, 3, 4))
+        input = flow.Tensor(np_arr)
+        np_arr[-1] = 1
+        input[-1] = 1
+        test_case.assertTrue(np.array_equal(input.numpy(), np_arr))
+
+    def test_logical_slice_assign_ellipsis_type(test_case):
+        np_arr = np.zeros(shape=(2, 3, 4, 5, 6))
+        input = flow.Tensor(np_arr)
+        np_arr[0, ::1, ..., 2:3] = 1
+        input[0, ::1, ..., 2:3] = 1
+        test_case.assertTrue(np.array_equal(input.numpy(), np_arr))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_smoothl1loss.py b/python/oneflow/test/modules/test_smoothl1loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc2660b113e7caea479386b0b9c3e05f03d7f256
--- /dev/null
+++ b/python/oneflow/test/modules/test_smoothl1loss.py
@@ -0,0 +1,106 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type, type_name_to_np_type
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_smoothl1loss(np_input, np_target, beta=1.0):
+    original_shape = np_input.shape
+    elem_cnt = np_input.size
+    np_input = np_input.reshape(-1)
+    np_target = np_target.reshape(-1)
+    loss = np.zeros(elem_cnt).astype(np_input.dtype)
+    for i in np.arange(elem_cnt):
+        abs_diff = abs(np_input[i] - np_target[i])
+        if abs_diff < beta:
+            loss[i] = 0.5 * abs_diff * abs_diff / beta
+        else:
+            loss[i] = abs_diff - 0.5 * beta
+    return {
+        "none": loss.reshape(original_shape),
+        "mean": np.mean(loss),
+        "sum": np.sum(loss),
+    }
+
+
+def _np_smoothl1loss_grad(np_input, np_target, beta=1.0):
+    original_shape = np_input.shape
+    elem_cnt = np_input.size
+    np_input = np_input.reshape(-1)
+    np_target = np_target.reshape(-1)
+    np_input_grad = np.zeros(elem_cnt).astype(np_input.dtype)
+    for i in np.arange(elem_cnt):
+        diff = np_input[i] - np_target[i]
+        abs_diff = abs(diff)
+        if abs_diff < beta:
+            np_input_grad[i] = diff / beta
+        else:
+            np_input_grad[i] = np.sign(diff)
+    np_input_grad_sum = np_input_grad.reshape(original_shape)
+    np_input_grad_mean = np_input_grad_sum / elem_cnt
+    return {
+        "none": np_input_grad_sum,
+        "mean": np_input_grad_mean,
+        "sum": np_input_grad_sum,
+    }
+
+
+def _test_smoothl1loss_impl(test_case, device, shape, data_type, reduction, beta):
+    x = np.random.randn(*shape).astype(type_name_to_np_type[data_type])
+    y = np.random.randn(*shape).astype(type_name_to_np_type[data_type])
+    input = flow.Tensor(
+        x,
+        dtype=type_name_to_flow_type[data_type],
+        requires_grad=True,
+        device=flow.device(device),
+    )
+    target = flow.Tensor(
+        y, dtype=type_name_to_flow_type[data_type], device=flow.device(device)
+    )
+    loss = flow.nn.SmoothL1Loss(reduction=reduction, beta=beta)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    np_out = _np_smoothl1loss(x, y, beta)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_smoothl1loss_grad(x, y, beta=beta)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSmoothL1LossModule(flow.unittest.TestCase):
+    def test_smoothl1loss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_smoothl1loss_impl]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [(10, 3), (100,)]
+        arg_dict["data_type"] = ["float32", "double"]
+        arg_dict["reduction"] = ["none", "mean", "sum"]
+        arg_dict["beta"] = [0, 0.5, 1]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_softplus.py b/python/oneflow/test/modules/test_softplus.py
new file mode 100644
index 0000000000000000000000000000000000000000..501f20a980ee0ac1c53c9b0b803ba7877605c438
--- /dev/null
+++ b/python/oneflow/test/modules/test_softplus.py
@@ -0,0 +1,52 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_softplus_impl(test_case, shape, device):
+    np_input = np.random.randn(*shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    np_x_grad = np.exp(np_input) / (1 + np.exp(np_input))
+    of_out = flow.softplus(of_input)
+    np_out = np.log(1 + np.exp(np_input))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_x_grad, 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class Testsoftplus(flow.unittest.TestCase):
+    def test_softplus(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_softplus_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_sort.py b/python/oneflow/test/modules/test_sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7cf1bd5af9e721c6cdc1a79bec0a4c8abf6d7d1
--- /dev/null
+++ b/python/oneflow/test/modules/test_sort.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_sort(test_case, data_shape, axis, descending, data_type, device):
+    input = flow.Tensor(
+        np.random.randn(*data_shape),
+        dtype=type_name_to_flow_type[data_type],
+        device=flow.device(device),
+    )
+    (of_values, of_indices) = flow.sort(input, dim=axis, descending=descending)
+    np_input = -input.numpy() if descending else input.numpy()
+    np_indices = np.argsort(np_input, axis=axis)
+    np_out = np.sort(np_input, axis=axis)
+    np_values = -np_out if descending else np_out
+    test_case.assertTrue(
+        np.array_equal(of_values.numpy().flatten(), np_values.flatten())
+    )
+    test_case.assertTrue(
+        np.array_equal(of_indices.numpy().flatten(), np_indices.flatten())
+    )
+
+
+def _test_tensor_sort(test_case, data_shape, axis, descending, data_type, device):
+    input = flow.Tensor(
+        np.random.randn(*data_shape),
+        dtype=type_name_to_flow_type[data_type],
+        device=flow.device(device),
+    )
+    (of_values, of_indices) = input.sort(dim=axis, descending=descending)
+    np_input = -input.numpy() if descending else input.numpy()
+    np_indices = np.argsort(np_input, axis=axis)
+    np_out = np.sort(np_input, axis=axis)
+    np_values = -np_out if descending else np_out
+    test_case.assertTrue(
+        np.array_equal(of_values.numpy().flatten(), np_values.flatten())
+    )
+    test_case.assertTrue(
+        np.array_equal(of_indices.numpy().flatten(), np_indices.flatten())
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSort(flow.unittest.TestCase):
+    def test_sort(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_sort, _test_tensor_sort]
+        arg_dict["data_shape"] = [(2, 6, 5, 4), (3, 4, 8)]
+        arg_dict["axis"] = [-1, 0, 2]
+        arg_dict["descending"] = [True, False]
+        arg_dict["data_type"] = ["double", "float32", "int32"]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_sparse.py b/python/oneflow/test/modules/test_sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb800ace0210b9862673e8286458c7a7a27a240d
--- /dev/null
+++ b/python/oneflow/test/modules/test_sparse.py
@@ -0,0 +1,99 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_embedding_impl(test_case, device):
+    weight = np.array(
+        [
+            [0.68258786, 0.6957856, 1.1829041],
+            [1.0154, -1.0616943, 0.50303376],
+            [0.29679507, 0.65562993, 1.0424724],
+            [-0.42980736, -0.35347632, -0.15600166],
+            [0.6763601, -0.24286619, -2.0873115],
+            [-0.13371214, -0.5589277, 1.9173933],
+            [0.08762296, 1.0264007, -0.67938024],
+            [0.32019204, -0.26137325, -1.3534237],
+            [-1.1555519, -0.67776406, 0.27372134],
+            [1.0615997, -0.59715784, 1.9855849],
+        ],
+        dtype=np.float32,
+    )
+    output = np.array(
+        [
+            [
+                [1.0154, -1.0616943, 0.50303376],
+                [0.29679507, 0.65562993, 1.0424724],
+                [0.6763601, -0.24286619, -2.0873115],
+                [-0.13371214, -0.5589277, 1.9173933],
+            ],
+            [
+                [0.6763601, -0.24286619, -2.0873115],
+                [-0.42980736, -0.35347632, -0.15600166],
+                [0.29679507, 0.65562993, 1.0424724],
+                [1.0615997, -0.59715784, 1.9855849],
+            ],
+        ],
+        dtype=np.float32,
+    )
+    indices = flow.Tensor(
+        [[1, 2, 4, 5], [4, 3, 2, 9]],
+        dtype=flow.int,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    m = flow.nn.Embedding(10, 3, _weight=flow.Tensor(weight))
+    m = m.to(device)
+    y = m(indices)
+    test_case.assertTrue(np.allclose(y.numpy(), output, 1e-05, 1e-05))
+    y = y.sum()
+    y.backward()
+    weight_grad_np = [
+        [0.0, 0.0, 0.0],
+        [1.0, 1.0, 1.0],
+        [2.0, 2.0, 2.0],
+        [1.0, 1.0, 1.0],
+        [2.0, 2.0, 2.0],
+        [1.0, 1.0, 1.0],
+        [0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0],
+        [1.0, 1.0, 1.0],
+    ]
+    test_case.assertTrue(
+        np.allclose(m.weight.grad.numpy(), weight_grad_np, 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestEmbedding(flow.unittest.TestCase):
+    def test_embedding(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_embedding_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_squeeze.py b/python/oneflow/test/modules/test_squeeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..a69f92cc3b19badf05838861a010c29732a2bd30
--- /dev/null
+++ b/python/oneflow/test/modules/test_squeeze.py
@@ -0,0 +1,113 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_squeeze(test_case, device):
+    np_arr = np.random.rand(1, 1, 1, 3)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    of_shape = flow.squeeze(input, dim=[1, 2]).numpy().shape
+    np_shape = (1, 3)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+    test_case.assertTrue(
+        np.allclose(
+            flow.squeeze(input, dim=[1, 2]).numpy(),
+            np.squeeze(input.numpy(), axis=(1, 2)),
+            0.0001,
+            0.0001,
+        )
+    )
+
+
+def _test_squeeze_1d_input(test_case, device):
+    np_arr = np.random.rand(10)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    output = flow.squeeze(input)
+    test_case.assertTrue(np.allclose(output.numpy(), np_arr, 1e-05, 1e-05))
+
+
+def _test_tensor_squeeze(test_case, device):
+    np_arr = np.random.rand(1, 1, 1, 3)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    of_shape = input.squeeze(dim=[1, 2]).numpy().shape
+    np_shape = (1, 3)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+    test_case.assertTrue(
+        np.allclose(
+            input.squeeze(dim=[1, 2]).numpy(),
+            np.squeeze(input.numpy(), axis=(1, 2)),
+            0.0001,
+            0.0001,
+        )
+    )
+
+
+def _test_squeeze_int(test_case, device):
+    np_arr = np.random.rand(1, 1, 1, 3)
+    input = flow.Tensor(np_arr, device=flow.device(device))
+    of_shape = flow.squeeze(input, 1).numpy().shape
+    np_shape = (1, 1, 3)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+    test_case.assertTrue(
+        np.allclose(
+            input.squeeze(1).numpy(), np.squeeze(input.numpy(), axis=1), 0.0001, 0.0001
+        )
+    )
+
+
+def _test_squeeze_backward(test_case, device):
+    np_arr = np.random.rand(1, 1, 1, 3)
+    input = flow.Tensor(np_arr, device=flow.device(device), requires_grad=True)
+    y = flow.squeeze(input, dim=1).sum()
+    y.backward()
+    np_grad = np.ones((1, 1, 1, 3))
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSqueeze(flow.unittest.TestCase):
+    def test_squeeze(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_squeeze,
+            _test_squeeze_1d_input,
+            _test_squeeze_int,
+            _test_tensor_squeeze,
+            _test_squeeze_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest()
+    def test_flow_squeeze_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.squeeze(x, random(1, 3).to(int))
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_stack.py b/python/oneflow/test/modules/test_stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..c74a90be7fee43abb91129a3683ea75a40f69c1f
--- /dev/null
+++ b/python/oneflow/test/modules/test_stack.py
@@ -0,0 +1,108 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import random
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_stack(test_case, device, shape):
+    x = np.random.rand(*shape)
+    y = np.random.rand(*shape)
+    x_tensor = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y_tensor = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    out_np = np.stack([x, y], axis=1)
+    out_of = flow.stack([x_tensor, y_tensor], dim=1).numpy()
+    test_case.assertTrue(np.allclose(out_np, out_of, 1e-05, 1e-05))
+
+
+def _test_stack_tuple_input(test_case, device, shape):
+    x = np.random.rand(*shape)
+    y = np.random.rand(*shape)
+    x_tensor = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    y_tensor = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+    out_np = np.stack([x, y], axis=0)
+    out_of = flow.stack((x_tensor, y_tensor), dim=0).numpy()
+    test_case.assertTrue(np.allclose(out_np, out_of, 1e-05, 1e-05))
+
+
+def _test_stack_backward(test_case, device, shape):
+    x = np.random.rand(*shape)
+    y = np.random.rand(*shape)
+    x_tensor = flow.Tensor(x, device=flow.device(device), requires_grad=True)
+    y_tensor = flow.Tensor(y, device=flow.device(device), requires_grad=True)
+    out_of = flow.stack([x_tensor, y_tensor]).sum()
+    out_of.backward()
+    test_case.assertTrue(
+        np.allclose(x_tensor.grad.numpy(), np.ones(x_tensor.shape), 1e-05, 1e-05)
+    )
+    test_case.assertTrue(
+        np.allclose(y_tensor.grad.numpy(), np.ones(y_tensor.shape), 1e-05, 1e-05)
+    )
+
+
+def _test_stack_different_dim(test_case, device, shape):
+    x = np.random.rand(*shape)
+    y = np.random.rand(*shape)
+    x_tensor = flow.Tensor(x, device=flow.device(device))
+    y_tensor = flow.Tensor(y, device=flow.device(device))
+    for axis in range(-len(x.shape) - 1, len(x.shape) + 1):
+        out_of = flow.stack([x_tensor, y_tensor], dim=axis)
+        out_np = np.stack([x, y], axis=axis)
+        test_case.assertTrue(np.allclose(out_np, out_of.numpy(), 1e-05, 1e-05))
+
+
+def _test_stack_multi_input(test_case, device, shape):
+    max_input_num = 10
+    for i in range(2, max_input_num):
+        x = []
+        x_tensor = []
+        for _ in range(0, i):
+            tmp = np.random.rand(*shape)
+            x.append(tmp)
+            x_tensor.append(flow.Tensor(tmp, device=flow.device(device)))
+        out_of = flow.stack(x_tensor, dim=-1)
+        out_np = np.stack(x, axis=-1)
+        test_case.assertTrue(np.allclose(out_np, out_of.numpy(), 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestStack(flow.unittest.TestCase):
+    def test_stack(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_stack,
+            _test_stack_tuple_input,
+            _test_stack_backward,
+            _test_stack_different_dim,
+            _test_stack_multi_input,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [
+            tuple((random.randrange(1, 10) for _ in range(i))) for i in range(3, 6)
+        ]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_sub.py b/python/oneflow/test/modules/test_sub.py
new file mode 100644
index 0000000000000000000000000000000000000000..623d33d91db8b3e62af3705b778eb6a6f6ba27cf
--- /dev/null
+++ b/python/oneflow/test/modules/test_sub.py
@@ -0,0 +1,115 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_sub_impl(test_case, shape, device):
+    x = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=True
+    )
+    y = flow.Tensor(
+        np.random.randn(*shape), device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.sub(x, y)
+    np_out = np.subtract(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad_x = np.ones(shape)
+    np_grad_y = -np.ones(shape)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad_x, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y.grad.numpy(), np_grad_y, 1e-05, 1e-05))
+    x = 5
+    y = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    of_out = flow.sub(x, y)
+    np_out = np.subtract(x, y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = 5
+    of_out = flow.sub(x, y)
+    np_out = np.subtract(x.numpy(), y)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    x = flow.Tensor(np.random.randn(*shape), device=flow.device(device))
+    y = flow.Tensor(np.random.randn(1, 1), device=flow.device(device))
+    of_out = flow.sub(x, y)
+    np_out = np.subtract(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    x = flow.Tensor(np.array([5.0]))
+    y = flow.Tensor(np.random.randn(1, 1))
+    of_out = flow.sub(x, y)
+    np_out = np.subtract(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    x = flow.Tensor(np.random.randn(1, 1), requires_grad=True)
+    y = flow.Tensor(np.array([5.0]), requires_grad=True)
+    of_out = flow.sub(x, y)
+    np_out = np.subtract(x.numpy(), y.numpy())
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad_x = np.ones((1, 1))
+    np_grad_y = -np.ones(1)
+    test_case.assertTrue(np.allclose(x.grad.numpy(), np_grad_x, 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(y.grad.numpy(), np_grad_y, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSubModule(flow.unittest.TestCase):
+    def test_sub(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_sub_impl(test_case, *arg)
+
+    def test_sub_against_pytorch(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_type"] = [test_flow_against_pytorch, test_tensor_against_pytorch]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["op"] = ["sub"]
+        for arg in GenArgList(arg_dict):
+            arg[0](
+                test_case,
+                arg[2],
+                extra_annotations={"other": flow.Tensor},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=2, dim1=3),
+                    "other": random_tensor(ndim=2, dim0=2, dim1=3),
+                },
+                device=arg[1],
+            )
+            arg[0](
+                test_case,
+                arg[2],
+                extra_annotations={"other": float},
+                extra_generators={
+                    "input": random_tensor(ndim=2, dim0=2, dim1=3),
+                    "other": random(0, 5),
+                },
+                device=arg[1],
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_sum.py b/python/oneflow/test/modules/test_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9927eeda88c58a255899adae6da9a8927ff5d05
--- /dev/null
+++ b/python/oneflow/test/modules/test_sum.py
@@ -0,0 +1,81 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_sum_impl(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.sum(input, dim=0)
+    np_out = np.sum(input.numpy(), axis=0)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    input = flow.Tensor(
+        np.random.randn(2, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.sum(input, dim=0)
+    np_out = np.sum(input.numpy(), axis=0)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    input = flow.Tensor(
+        np.random.randn(2, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.sum(input, dim=1)
+    of_out2 = input.sum(dim=1)
+    np_out = np.sum(input.numpy(), axis=1)
+    test_case.assertTrue(np.allclose(of_out2.numpy(), of_out.numpy(), 1e-05, 1e-05))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    input = flow.Tensor(
+        np.random.randn(4, 5, 6),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    of_out = flow.sum(input, dim=(2, 1))
+    np_out = np.sum(input.numpy(), axis=(2, 1))
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.ones((4, 5, 6))
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestSumModule(flow.unittest.TestCase):
+    def test_sum(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_sum_impl(test_case, *arg)
+
+    def test_sum_against_pytorch(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_type"] = [test_flow_against_pytorch, test_tensor_against_pytorch]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, "sum", device=arg[1])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_tan.py b/python/oneflow/test/modules/test_tan.py
new file mode 100644
index 0000000000000000000000000000000000000000..bad582d05c75fe0efc49e9ce6aacb6a9146eb6b9
--- /dev/null
+++ b/python/oneflow/test/modules/test_tan.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_tan_impl(test_case, shape, device):
+    np_input = np.random.random(shape) - 0.5
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    of_out = flow.tan(of_input)
+    np_out = np.tan(np_input)
+    test_case.assertTrue(
+        np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+    )
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = 1 + np.square(np_out)
+    test_case.assertTrue(
+        np.allclose(of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTan(flow.unittest.TestCase):
+    def test_tan(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(2,), (2, 3), (2, 3, 4), (2, 4, 5, 6)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_tan_impl(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_tensor_buffer.py b/python/oneflow/test/modules/test_tensor_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8977ff5f0b0e582df17f3f9d942832f9a0153845
--- /dev/null
+++ b/python/oneflow/test/modules/test_tensor_buffer.py
@@ -0,0 +1,49 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList, type_name_to_flow_type
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_tensor_buffer_convert(test_case, device):
+    input = flow.Tensor(
+        np.random.rand(16, 24, 32, 36), dtype=flow.float32, device=flow.device(device)
+    )
+    tensor_buffer = flow.tensor_to_tensor_buffer(input, instance_dims=2)
+    orig_tensor = flow.tensor_buffer_to_tensor(
+        tensor_buffer, dtype=flow.float32, instance_shape=[32, 36]
+    )
+    test_case.assertTrue(np.array_equal(input.numpy(), orig_tensor.numpy()))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTensorBufferOps(flow.unittest.TestCase):
+    def test_tensor_buffer_convert(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_tensor_buffer_convert]
+        arg_dict["device"] = ["cpu"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_tensor_ops.py b/python/oneflow/test/modules/test_tensor_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8bda42a96ca16a76947156b2f9131fb87f52e61
--- /dev/null
+++ b/python/oneflow/test/modules/test_tensor_ops.py
@@ -0,0 +1,63 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_type_as(test_case, shape, device, src_dtype, tgt_dtype):
+    np_input = np.random.rand(*shape)
+    input = flow.tensor(np_input, dtype=src_dtype, device=device)
+    target = flow.tensor(np_input, dtype=tgt_dtype, device=device)
+    input = input.type_as(target)
+    test_case.assertEqual(input.dtype, target.dtype)
+
+
+def _test_long(test_case, shape, device, dtype):
+    np_input = np.random.rand(*shape)
+    input = flow.tensor(np_input, dtype=dtype, device=device)
+    input = input.long()
+    test_case.assertEqual(input.dtype, flow.int64)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTensorOps(flow.unittest.TestCase):
+    def test_type_as(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(1, 2), (3, 4, 5), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["src_dtype"] = [flow.int64, flow.int32, flow.float32, flow.float64]
+        arg_dict["tgt_dtype"] = [flow.int64, flow.int32, flow.float32, flow.float64]
+        for arg in GenArgList(arg_dict):
+            _test_type_as(test_case, *arg)
+
+    def test_long(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(1, 2), (3, 4, 5), (2, 3, 4, 5)]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["dtype"] = [flow.int64, flow.int32, flow.float32, flow.float64]
+        for arg in GenArgList(arg_dict):
+            _test_long(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_tensor_to.py b/python/oneflow/test/modules/test_tensor_to.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad4385fe2be9d894cdf91e79a6d5d2478d22dd35
--- /dev/null
+++ b/python/oneflow/test/modules/test_tensor_to.py
@@ -0,0 +1,84 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTo(flow.unittest.TestCase):
+    def test_tensor_to_h2d(test_case):
+        input = flow.Tensor(np.random.randn(2, 3, 4, 5))
+        output = input.to(device=flow.device("cuda"))
+        test_case.assertEqual(output.device, flow.device("cuda"))
+        test_case.assertTrue(
+            np.allclose(input.numpy(), output.numpy(), rtol=0.0001, atol=0.0001)
+        )
+        gpu_output = output.to(device=flow.device("cuda"))
+        test_case.assertEqual(gpu_output.device, flow.device("cuda"))
+        test_case.assertTrue(
+            np.allclose(input.numpy(), gpu_output.numpy(), rtol=0.0001, atol=0.0001)
+        )
+
+    def test_tensor_to_d2h(test_case):
+        input = flow.Tensor(np.random.randn(2, 3, 4, 5), device=flow.device("cuda"))
+        output = input.to(device=flow.device("cpu"))
+        test_case.assertEqual(output.device, flow.device("cpu"))
+        test_case.assertTrue(
+            np.allclose(input.numpy(), output.numpy(), rtol=0.0001, atol=0.0001)
+        )
+
+    def test_tensor_to_d2d(test_case):
+        input = flow.Tensor(np.random.randn(2, 3, 4, 5), device=flow.device("cuda"))
+        output = input.to(device=flow.device("cuda:0"))
+        test_case.assertEqual(output.device, flow.device("cuda:0"))
+        test_case.assertTrue(
+            np.allclose(input.numpy(), output.numpy(), rtol=0.0001, atol=0.0001)
+        )
+
+    def test_tensor_to_h2h(test_case):
+        input = flow.Tensor(np.random.randn(2, 3, 4, 5))
+        output = input.to(device=flow.device("cpu"))
+        test_case.assertEqual(output.device, flow.device("cpu"))
+        test_case.assertTrue(
+            np.allclose(input.numpy(), output.numpy(), rtol=0.0001, atol=0.0001)
+        )
+
+    def test_tensor_to_cast(test_case):
+        input = flow.Tensor(np.random.randn(2, 3, 4, 5))
+        output = input.to(dtype=flow.int)
+        test_case.assertEqual(output.dtype, flow.int)
+
+    def test_tensor_to_cast_h2d(test_case):
+        input = flow.Tensor(np.random.randn(2, 3, 4, 5))
+        output = input.to(device=flow.device("cuda"), dtype=flow.int)
+        test_case.assertEqual(output.dtype, flow.int)
+        test_case.assertEqual(output.device, flow.device("cuda"))
+
+    def test_tensor_using_tensor(test_case):
+        tensor = flow.Tensor(np.random.randn(2, 3, 4, 5), device="cuda", dtype=flow.int)
+        input = flow.Tensor(np.random.randn(2, 3))
+        output = input.to(tensor)
+        test_case.assertEqual(output.dtype, flow.int)
+        test_case.assertEqual(output.device, flow.device("cuda"))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_tile.py b/python/oneflow/test/modules/test_tile.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae9a94ed84ed42db4ceda14a661d02e765964993
--- /dev/null
+++ b/python/oneflow/test/modules/test_tile.py
@@ -0,0 +1,180 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def np_tile(x, sizes):
+    return np.tile(x, sizes)
+
+
+def np_tile_grad(x, sizes):
+    times = np.array(sizes).prod()
+    return np.ones(shape=x.shape) * times
+
+
+def _test_tile_less_dim_a(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    sizes = (2,)
+    np_out = np_tile(input.numpy(), sizes)
+    of_out = input.tile(reps=sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tile_less_dim_b(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(3, 2, 5), dtype=flow.float32, device=flow.device(device)
+    )
+    sizes = (3, 4)
+    np_out = np_tile(input.numpy(), sizes)
+    of_out = input.tile(reps=sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tile_less_dim_c(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(4, 3, 2, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    sizes = (2, 3, 4, 4)
+    np_out = np_tile(input.numpy(), sizes)
+    of_out = input.tile(reps=sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tile_same_dim(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    sizes = (4, 2, 3, 19)
+    of_out = input.tile(reps=sizes)
+    np_out = np_tile(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+
+def _test_tile_same_dim_int(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3), dtype=flow.int32, device=flow.device(device)
+    )
+    size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19))
+    sizes = size_tensor.size()
+    of_out = input.tile(reps=sizes)
+    np_out = np_tile(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32)))
+
+
+def _test_tile_same_dim_int8(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3), dtype=flow.int8, device=flow.device(device)
+    )
+    size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19))
+    sizes = size_tensor.size()
+    of_out = input.tile(reps=sizes)
+    np_out = np_tile(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32)))
+
+
+def _test_tile_less_dim_a_backward(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 4, 1, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    sizes = (2,)
+    of_out = input.tile(reps=sizes)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np_tile_grad(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+def _test_tile_less_dim_b_backward(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(3, 2, 5),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    sizes = (3, 4)
+    of_out = input.tile(reps=sizes)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np_tile_grad(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+def _test_tile_less_dim_c_backward(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(4, 3, 2, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    sizes = (2, 3, 4, 4)
+    of_out = input.tile(reps=sizes)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np_tile_grad(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+def _test_tile_same_dim_backward(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(1, 2, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    sizes = (1, 2, 3, 1)
+    of_out = input.tile(reps=sizes)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np_tile_grad(input.numpy(), sizes)
+    test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTile(flow.unittest.TestCase):
+    def test_tile(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_tile_less_dim_a,
+            _test_tile_less_dim_b,
+            _test_tile_less_dim_c,
+            _test_tile_same_dim,
+            _test_tile_same_dim_int,
+            _test_tile_same_dim_int8,
+            _test_tile_less_dim_a_backward,
+            _test_tile_less_dim_b_backward,
+            _test_tile_less_dim_c_backward,
+            _test_tile_same_dim_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_transpose.py b/python/oneflow/test/modules/test_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ba4b13305fbd242128f83809156c6258db379c3
--- /dev/null
+++ b/python/oneflow/test/modules/test_transpose.py
@@ -0,0 +1,107 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_transpose(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.transpose(input, 0, 1)
+    np_out = input.numpy().transpose((1, 0, 2, 3))
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_tensor_transpose(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = input.transpose(0, 1)
+    np_out = input.numpy().transpose((1, 0, 2, 3))
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_tranpose_negative_dim(test_case, device):
+    input = flow.Tensor(
+        np.random.randn(2, 6, 5, 3), dtype=flow.float32, device=flow.device(device)
+    )
+    of_out = flow.transpose(input, -4, -3)
+    np_out = input.numpy().transpose((1, 0, 2, 3))
+    test_case.assertTrue(np.array_equal(of_out.numpy().flatten(), np_out.flatten()))
+
+
+def _test_transpose_backward(test_case, device):
+    x = flow.Tensor(
+        np.random.randn(2, 6, 5, 3),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.transpose(x, 0, 1).sum()
+    y.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.ones((2, 6, 5, 3)), 1e-05, 1e-05)
+    )
+
+
+def _test_transpose_backward_v2(test_case, device):
+    x = flow.Tensor(
+        np.random.randn(2, 3, 4, 5),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.transpose(x, 3, 1).sum()
+    y.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.ones((2, 3, 4, 5)), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTranspose(flow.unittest.TestCase):
+    def test_transpose(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["fun"] = [
+            _test_transpose,
+            _test_tensor_transpose,
+            _test_tranpose_negative_dim,
+            _test_transpose_backward,
+            _test_transpose_backward_v2,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest()
+    def test_transpose_flow_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor(ndim=4).to(device)
+        y = torch.transpose(x, dim0=random(1, 3).to(int), dim1=random(1, 3).to(int))
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_triu.py b/python/oneflow/test/modules/test_triu.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c85f6796005fca5efeb79cf8d04d31b1c61852a
--- /dev/null
+++ b/python/oneflow/test/modules/test_triu.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.nn as nn
+import oneflow.unittest
+
+
+def _test_triu(test_case, diagonal, device):
+    arr_shape = (4, 4, 8)
+    np_arr = np.random.randn(*arr_shape)
+    input_tensor = flow.Tensor(
+        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    output = flow.triu(input_tensor, diagonal=diagonal)
+    np_out = np.triu(np_arr, diagonal)
+    test_case.assertTrue(np.allclose(output.numpy(), np_out, 1e-06, 1e-06))
+    output = output.sum()
+    output.backward()
+    np_grad = np.triu(np.ones(shape=arr_shape, dtype=np.float32), diagonal)
+    test_case.assertTrue(np.allclose(input_tensor.grad.numpy(), np_grad, 1e-06, 1e-06))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTriu(flow.unittest.TestCase):
+    def test_triu(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_triu]
+        arg_dict["diagonal"] = [2, -1]
+        arg_dict["device"] = ["cuda", "cpu"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_unsqueeze.py b/python/oneflow/test/modules/test_unsqueeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..1baf9cbae3be1884e360ea7145e6dcdf294e18ac
--- /dev/null
+++ b/python/oneflow/test/modules/test_unsqueeze.py
@@ -0,0 +1,86 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_unsqueeze(test_case, device):
+    np_arr = np.random.rand(2, 6, 9, 3)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    y = flow.unsqueeze(x, dim=1)
+    output = np.expand_dims(np_arr, axis=1)
+    test_case.assertTrue(np.allclose(output, y.numpy(), 1e-05, 1e-05))
+
+
+def _test_unsqueeze_tensor_function(test_case, device):
+    np_arr = np.random.rand(2, 3, 4)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    y = x.unsqueeze(dim=2)
+    output = np.expand_dims(np_arr, axis=2)
+    test_case.assertTrue(np.allclose(output, y.numpy(), 1e-05, 1e-05))
+
+
+def _test_unsqueeze_different_dim(test_case, device):
+    np_arr = np.random.rand(4, 5, 6, 7)
+    x = flow.Tensor(np_arr, device=flow.device(device))
+    for axis in range(-5, 5):
+        y = flow.unsqueeze(x, dim=axis)
+        output = np.expand_dims(np_arr, axis=axis)
+        test_case.assertTrue(np.allclose(output, y.numpy(), 1e-05, 1e-05))
+
+
+def _test_unsqueeze_backward(test_case, device):
+    np_arr = np.random.rand(2, 3, 4, 5)
+    x = flow.Tensor(np_arr, device=flow.device(device), requires_grad=True)
+    y = flow.unsqueeze(x, dim=1).sum()
+    y.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), np.ones((2, 3, 4, 5)), 1e-05, 1e-05)
+    )
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUnsqueeze(flow.unittest.TestCase):
+    def test_unsqueeze(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_unsqueeze,
+            _test_unsqueeze_tensor_function,
+            _test_unsqueeze_different_dim,
+            _test_unsqueeze_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+    @autotest()
+    def test_flow_unsqueeze_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = torch.unsqueeze(x, random(1, 3).to(int))
+        return y
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_upsample2d.py b/python/oneflow/test/modules/test_upsample2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f52ad0fb1f10586b8f42b08d67a3fb371998ea
--- /dev/null
+++ b/python/oneflow/test/modules/test_upsample2d.py
@@ -0,0 +1,401 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_upsample2d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+    )
+    m = flow.nn.Upsample(scale_factor=2.0, mode="nearest")
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.0, 2.0, 2.0],
+                    [1.0, 1.0, 2.0, 2.0],
+                    [3.0, 3.0, 4.0, 4.0],
+                    [3.0, 3.0, 4.0, 4.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_upsample2d_bilinear(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+    )
+    m = flow.nn.Upsample(scale_factor=2.0, mode="bilinear")
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.25, 1.75, 2.0],
+                    [1.5, 1.75, 2.25, 2.5],
+                    [2.5, 2.75, 3.25, 3.5],
+                    [3.0, 3.25, 3.75, 4.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_upsample2d_bilinear_aligncorner(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+    )
+    m = flow.nn.Upsample(scale_factor=2.0, mode="bilinear", align_corners=True)
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.3333, 1.6667, 2.0],
+                    [1.6667, 2.0, 2.3333, 2.6667],
+                    [2.3333, 2.6667, 3.0, 3.3333],
+                    [3.0, 3.3333, 3.6667, 4.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_UpsamplingNearest2d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+    )
+    m = flow.nn.UpsamplingNearest2d(scale_factor=2.0)
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.0, 2.0, 2.0],
+                    [1.0, 1.0, 2.0, 2.0],
+                    [3.0, 3.0, 4.0, 4.0],
+                    [3.0, 3.0, 4.0, 4.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_UpsamplingBilinear2d(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+    )
+    m = flow.nn.UpsamplingBilinear2d(scale_factor=2.0)
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.3333, 1.6667, 2.0],
+                    [1.6667, 2.0, 2.3333, 2.6667],
+                    [2.3333, 2.6667, 3.0, 3.3333],
+                    [3.0, 3.3333, 3.6667, 4.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+
+def _test_upsample2d_4dim(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 37).reshape((2, 2, 3, 3)),
+        device=flow.device(device),
+        dtype=flow.float32,
+    )
+    m = flow.nn.Upsample(scale_factor=2.0, mode="nearest")
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
+                    [1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
+                    [4.0, 4.0, 5.0, 5.0, 6.0, 6.0],
+                    [4.0, 4.0, 5.0, 5.0, 6.0, 6.0],
+                    [7.0, 7.0, 8.0, 8.0, 9.0, 9.0],
+                    [7.0, 7.0, 8.0, 8.0, 9.0, 9.0],
+                ],
+                [
+                    [10.0, 10.0, 11.0, 11.0, 12.0, 12.0],
+                    [10.0, 10.0, 11.0, 11.0, 12.0, 12.0],
+                    [13.0, 13.0, 14.0, 14.0, 15.0, 15.0],
+                    [13.0, 13.0, 14.0, 14.0, 15.0, 15.0],
+                    [16.0, 16.0, 17.0, 17.0, 18.0, 18.0],
+                    [16.0, 16.0, 17.0, 17.0, 18.0, 18.0],
+                ],
+            ],
+            [
+                [
+                    [19.0, 19.0, 20.0, 20.0, 21.0, 21.0],
+                    [19.0, 19.0, 20.0, 20.0, 21.0, 21.0],
+                    [22.0, 22.0, 23.0, 23.0, 24.0, 24.0],
+                    [22.0, 22.0, 23.0, 23.0, 24.0, 24.0],
+                    [25.0, 25.0, 26.0, 26.0, 27.0, 27.0],
+                    [25.0, 25.0, 26.0, 26.0, 27.0, 27.0],
+                ],
+                [
+                    [28.0, 28.0, 29.0, 29.0, 30.0, 30.0],
+                    [28.0, 28.0, 29.0, 29.0, 30.0, 30.0],
+                    [31.0, 31.0, 32.0, 32.0, 33.0, 33.0],
+                    [31.0, 31.0, 32.0, 32.0, 33.0, 33.0],
+                    [34.0, 34.0, 35.0, 35.0, 36.0, 36.0],
+                    [34.0, 34.0, 35.0, 35.0, 36.0, 36.0],
+                ],
+            ],
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_upsample2d_bilinear_4dim(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 37).reshape((2, 2, 3, 3)),
+        device=flow.device(device),
+        dtype=flow.float32,
+    )
+    m = flow.nn.Upsample(scale_factor=2.0, mode="bilinear")
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.25, 1.75, 2.25, 2.75, 3.0],
+                    [1.75, 2.0, 2.5, 3.0, 3.5, 3.75],
+                    [3.25, 3.5, 4.0, 4.5, 5.0, 5.25],
+                    [4.75, 5.0, 5.5, 6.0, 6.5, 6.75],
+                    [6.25, 6.5, 7.0, 7.5, 8.0, 8.25],
+                    [7.0, 7.25, 7.75, 8.25, 8.75, 9.0],
+                ],
+                [
+                    [10.0, 10.25, 10.75, 11.25, 11.75, 12.0],
+                    [10.75, 11.0, 11.5, 12.0, 12.5, 12.75],
+                    [12.25, 12.5, 13.0, 13.5, 14.0, 14.25],
+                    [13.75, 14.0, 14.5, 15.0, 15.5, 15.75],
+                    [15.25, 15.5, 16.0, 16.5, 17.0, 17.25],
+                    [16.0, 16.25, 16.75, 17.25, 17.75, 18.0],
+                ],
+            ],
+            [
+                [
+                    [19.0, 19.25, 19.75, 20.25, 20.75, 21.0],
+                    [19.75, 20.0, 20.5, 21.0, 21.5, 21.75],
+                    [21.25, 21.5, 22.0, 22.5, 23.0, 23.25],
+                    [22.75, 23.0, 23.5, 24.0, 24.5, 24.75],
+                    [24.25, 24.5, 25.0, 25.5, 26.0, 26.25],
+                    [25.0, 25.25, 25.75, 26.25, 26.75, 27.0],
+                ],
+                [
+                    [28.0, 28.25, 28.75, 29.25, 29.75, 30.0],
+                    [28.75, 29.0, 29.5, 30.0, 30.5, 30.75],
+                    [30.25, 30.5, 31.0, 31.5, 32.0, 32.25],
+                    [31.75, 32.0, 32.5, 33.0, 33.5, 33.75],
+                    [33.25, 33.5, 34.0, 34.5, 35.0, 35.25],
+                    [34.0, 34.25, 34.75, 35.25, 35.75, 36.0],
+                ],
+            ],
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_upsample2d_backward(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    m = flow.nn.Upsample(scale_factor=2.0, mode="nearest")
+    of_out = m(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[4.0, 4.0], [4.0, 4.0]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_upsample2d_bilinear_aligncorner_backward(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    m = flow.nn.Upsample(scale_factor=2.0, mode="bilinear", align_corners=True)
+    of_out = m(input)
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = [[[[3.999999523162842, 4.000000476837158], [3.999999761581421, 4.0]]]]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_nearest_float_scale(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 10).reshape((1, 1, 3, 3)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    m = flow.nn.Upsample(scale_factor=1.5)
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.0, 1.0, 2.0, 3.0],
+                    [1.0, 1.0, 2.0, 3.0],
+                    [4.0, 4.0, 5.0, 6.0],
+                    [7.0, 7.0, 8.0, 9.0],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array([[[[4.0, 2.0, 2.0], [2.0, 1.0, 1.0], [2.0, 1.0, 1.0]]]])
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_interpolate_bilinear_float_scale(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5, dtype=np.int32).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    m = flow.nn.Upsample(scale_factor=0.5, mode="bilinear")
+    of_out = m(input)
+    np_out = np.array([[[[2.5]]]])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array([[[[0.25, 0.25], [0.25, 0.25]]]])
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+    input = flow.Tensor(
+        np.arange(1, 10, dtype=np.int32).reshape((1, 1, 3, 3)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    m = flow.nn.Upsample(scale_factor=0.5, mode="bilinear")
+    of_out = m(input)
+    np_out = np.array([[[[3.0]]]])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array([[[[0.25, 0.25, 0.0], [0.25, 0.25, 0.0], [0.0, 0.0, 0.0]]]])
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+    input = flow.Tensor(
+        np.arange(1, 11, dtype=np.int32).reshape((1, 1, 5, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    m = flow.nn.Upsample(size=(4, 4), mode="bilinear")
+    of_out = m(input)
+    np_out = np.array(
+        [
+            [
+                [
+                    [1.25, 1.5, 2.0, 2.25],
+                    [3.75, 4.0, 4.5, 4.75],
+                    [6.25, 6.5, 7.0, 7.25],
+                    [8.75, 9.0, 9.5, 9.75],
+                ]
+            ]
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array(
+        [[[[1.75, 1.75], [1.5, 1.5], [1.5, 1.5], [1.5, 1.5], [1.75, 1.75]]]]
+    )
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+def _test_upsample_bilinear_align_corners(test_case, device):
+    input = flow.Tensor(
+        np.arange(1, 5, dtype=np.int32).reshape((1, 1, 2, 2)),
+        device=flow.device(device),
+        dtype=flow.float32,
+        requires_grad=True,
+    )
+    m = flow.nn.Upsample(scale_factor=0.5, mode="bilinear", align_corners=True)
+    of_out = m(input)
+    np_out = np.array([[[[1.0]]]])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = np.array([[[[1.0, 0.0], [0.0, 0.0]]]])
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestUpsample2d(flow.unittest.TestCase):
+    def test_upsample2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_upsample2d,
+            _test_upsample2d_bilinear,
+            _test_upsample2d_bilinear_aligncorner,
+            _test_UpsamplingNearest2d,
+            _test_UpsamplingBilinear2d,
+            _test_upsample2d_4dim,
+            _test_upsample2d_bilinear_4dim,
+            _test_upsample2d_backward,
+            _test_upsample2d_bilinear_aligncorner_backward,
+            _test_interpolate_nearest_float_scale,
+            _test_interpolate_bilinear_float_scale,
+            _test_upsample_bilinear_align_corners,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_util.py b/python/oneflow/test/modules/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..10884b9c3c3a7fb499e4362228ea11b58f6713dc
--- /dev/null
+++ b/python/oneflow/test/modules/test_util.py
@@ -0,0 +1,117 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import itertools
+import os
+from collections import OrderedDict
+from collections.abc import Iterable
+
+import numpy as np
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def GenCartesianProduct(sets):
+    assert isinstance(sets, Iterable)
+    for set in sets:
+        assert isinstance(set, Iterable)
+        if os.getenv("ONEFLOW_TEST_CPU_ONLY"):
+            if "gpu" in set:
+                set.remove("gpu")
+    return itertools.product(*sets)
+
+
+def GenArgList(arg_dict):
+    assert isinstance(arg_dict, OrderedDict)
+    assert all([isinstance(x, list) for x in arg_dict.values()])
+    sets = [arg_set for (_, arg_set) in arg_dict.items()]
+    return GenCartesianProduct(sets)
+
+
+def GenArgDict(arg_dict):
+    return [dict(zip(arg_dict.keys(), x)) for x in GenArgList(arg_dict)]
+
+
+class Args:
+    def __init__(self, flow_args, tf_args=None):
+        super().__init__()
+        if tf_args is None:
+            tf_args = flow_args
+        self.flow_args = flow_args
+        self.tf_args = tf_args
+
+    def __str__(self):
+        return "flow_args={} tf_args={}".format(self.flow_args, self.tf_args)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+type_name_to_flow_type = {
+    "float16": flow.float16,
+    "float32": flow.float32,
+    "double": flow.double,
+    "int8": flow.int8,
+    "int32": flow.int32,
+    "int64": flow.int64,
+    "char": flow.char,
+    "uint8": flow.uint8,
+}
+type_name_to_np_type = {
+    "float16": np.float16,
+    "float32": np.float32,
+    "double": np.float64,
+    "int8": np.int8,
+    "int32": np.int32,
+    "int64": np.int64,
+    "char": np.byte,
+    "uint8": np.uint8,
+}
+
+
+def FlattenArray(input_array):
+    output_array = list()
+    for x in np.nditer(input_array):
+        output_array.append(x.tolist())
+    return output_array
+
+
+def Array2Numpy(input_array, target_shape):
+    return np.array(input_array).reshape(target_shape, order="C")
+
+
+def Index2Coordinate(idx, tensor_shape):
+    coordinate = []
+    tmp = idx
+    for i in range(len(tensor_shape) - 1, -1, -1):
+        axis_size = tensor_shape[i]
+        coor = tmp % axis_size
+        coordinate.insert(0, int(coor))
+        tmp = (tmp - coor) / axis_size
+    return coordinate
+
+
+def Coordinate2Index(coordinate, tensor_shape):
+    if len(coordinate) != len(tensor_shape):
+        raise "wrong coordinate or shape"
+    idx = 0
+    for (i, coor) in enumerate(coordinate):
+        size_at_axis = coor
+        for j in range(i + 1, len(tensor_shape)):
+            size_at_axis *= tensor_shape[j]
+        idx += size_at_axis
+    return idx
diff --git a/python/oneflow/test/modules/test_view.py b/python/oneflow/test/modules/test_view.py
new file mode 100644
index 0000000000000000000000000000000000000000..df5aaffe42a06f0972e1edabe01b37b932b5d055
--- /dev/null
+++ b/python/oneflow/test/modules/test_view.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_view(test_case, device):
+    x = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, device=flow.device(device))
+    of_shape = flow.view(input, shape=[2, 2, 2, -1]).numpy().shape
+    of_shape = flow.view(input, shape=[2, 2, 2, -1]).numpy().shape
+    np_shape = (2, 2, 2, 2)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+
+
+def _test_view_tuple(test_case, device):
+    x = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, device=flow.device(device))
+    of_shape = flow.view(input, shape=(2, 2, 2, -1)).numpy().shape
+    np_shape = (2, 2, 2, 2)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+
+
+def _test_tensor_view(test_case, device):
+    x = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, device=flow.device(device))
+    of_shape = input.view(shape=[2, 2, 2, -1]).numpy().shape
+    np_shape = (2, 2, 2, 2)
+    test_case.assertTrue(np.array_equal(of_shape, np_shape))
+
+
+def _test_view_backward(test_case, device):
+    x = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    ).astype(np.float32)
+    input = flow.Tensor(x, device=flow.device(device), requires_grad=True)
+    of_out = flow.view(input, shape=[2, 2, 2, -1]).sum()
+    of_out.backward()
+    np_grad = np.array(
+        [
+            [1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0],
+        ]
+    )
+    test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 0.0001, 0.0001))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestModule(flow.unittest.TestCase):
+    def test_view(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_view,
+            _test_view_tuple,
+            _test_tensor_view,
+            _test_view_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_where.py b/python/oneflow/test/modules/test_where.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb45bf4054b7f01077e3b359eff8f202e867f97a
--- /dev/null
+++ b/python/oneflow/test/modules/test_where.py
@@ -0,0 +1,196 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_where(test_case, device):
+    x = flow.Tensor(
+        np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    y = flow.Tensor(
+        np.ones(shape=(3, 2)), dtype=flow.float32, device=flow.device(device)
+    )
+    condition = flow.Tensor(
+        np.array([[0, 1], [1, 0], [1, 0]]), dtype=flow.int32, device=flow.device(device)
+    )
+    of_out = flow.where(condition, x, y)
+    np_out = np.array([[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_where_broadcast(test_case, device):
+    x = flow.Tensor(
+        np.array([[[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    y = flow.Tensor(
+        np.ones(shape=(3, 3, 2)), dtype=flow.float32, device=flow.device(device)
+    )
+    condition = flow.Tensor(
+        np.array([[[0, 1], [1, 0], [1, 0]]]),
+        dtype=flow.int32,
+        device=flow.device(device),
+    )
+    of_out = flow.where(condition, x, y)
+    np_out = np.array(
+        [
+            [[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]],
+            [[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]],
+            [[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]],
+        ]
+    )
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_where_scalar(test_case, device):
+    x = 0.5
+    y = 2.0
+    condition = flow.Tensor(np.array([1]), dtype=flow.int32)
+    of_out = flow.where(condition, x, y)
+    np_out = np.array([0.5])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_where_dim4(test_case, device):
+    x = flow.Tensor(
+        np.array([[[[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+    )
+    y = flow.Tensor(
+        np.ones(shape=(1, 1, 3, 2)), dtype=flow.float32, device=flow.device(device)
+    )
+    condition = flow.Tensor(
+        np.array([[[[0, 1], [1, 0], [1, 0]]]]),
+        dtype=flow.int32,
+        device=flow.device(device),
+    )
+    of_out = flow.where(condition, x, y)
+    np_out = np.array([[[[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]]]])
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+
+def _test_where_backward(test_case, device):
+    x = flow.Tensor(
+        np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.Tensor(
+        np.ones(shape=(3, 2)),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    condition = flow.Tensor(
+        np.array([[0, 1], [1, 0], [1, 0]]), dtype=flow.int32, device=flow.device(device)
+    )
+    of_out = flow.where(condition, x, y)
+    of_out = of_out.sum()
+    of_out.backward()
+    test_case.assertTrue(
+        np.allclose(x.grad.numpy(), condition.numpy() == 1, 1e-05, 1e-05)
+    )
+    test_case.assertTrue(
+        np.allclose(y.grad.numpy(), condition.numpy() == 0, 1e-05, 1e-05)
+    )
+
+
+def _test_where_broadcast_backward(test_case, device):
+    x = flow.Tensor(
+        np.array([[[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.Tensor(
+        np.ones(shape=(3, 3, 2)),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    condition = flow.Tensor(
+        np.array([[[0, 1], [1, 0], [1, 0]]]),
+        dtype=flow.int32,
+        device=flow.device(device),
+    )
+    of_out = flow.where(condition, x, y)
+    of_out = of_out.sum()
+    of_out.backward()
+    x_grad = [[[0.0, 3.0], [3.0, 0.0], [3.0, 0.0]]]
+    test_case.assertTrue(np.allclose(x.grad.numpy(), x_grad, 1e-05, 1e-05))
+    y_grad = [
+        [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]],
+        [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]],
+        [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]],
+    ]
+    test_case.assertTrue(np.allclose(y.grad.numpy(), y_grad, 1e-05, 1e-05))
+
+
+def _test_where_broadcast_x_backward(test_case, device):
+    x = flow.Tensor(
+        np.array([[[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]]),
+        dtype=flow.float32,
+        device=flow.device(device),
+        requires_grad=True,
+    )
+    y = flow.Tensor(
+        np.ones(shape=(3, 3, 2)), dtype=flow.float32, device=flow.device(device)
+    )
+    condition = flow.Tensor(
+        np.array([[[0, 1], [1, 0], [1, 0]]]),
+        dtype=flow.int32,
+        device=flow.device(device),
+    )
+    of_out = flow.where(condition, x, y)
+    of_out = of_out.sum()
+    of_out.backward()
+    x_grad = [[[0.0, 3.0], [3.0, 0.0], [3.0, 0.0]]]
+    test_case.assertTrue(np.allclose(x.grad.numpy(), x_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestWhere(flow.unittest.TestCase):
+    def test_where(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_where,
+            _test_where_broadcast,
+            _test_where_scalar,
+            _test_where_dim4,
+            _test_where_backward,
+            _test_where_broadcast_backward,
+            _test_where_broadcast_x_backward,
+        ]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/modules/test_zeropad2d.py b/python/oneflow/test/modules/test_zeropad2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa8f2bd475ed40fc51303f61c8eef3ff81346197
--- /dev/null
+++ b/python/oneflow/test/modules/test_zeropad2d.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from test_util import Array2Numpy, FlattenArray, GenArgList, Index2Coordinate
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _np_zero_pad2d_grad(src, dest, padding):
+    (c_idx, h_idx, w_idx) = (1, 2, 3)
+    pad_left = padding[0]
+    pad_right = padding[1]
+    pad_top = padding[2]
+    pad_bottom = padding[3]
+    (dx_height, dx_width) = (dest.shape[h_idx], dest.shape[w_idx])
+    (dy_height, dy_width) = (src.shape[h_idx], src.shape[w_idx])
+    numpy_src = np.ones(src.shape, np.int32)
+    numpy_dest = np.zeros(dest.shape, np.int32)
+    array_src = FlattenArray(numpy_src)
+    array_dest = FlattenArray(numpy_dest)
+    src_num = src.shape[c_idx] * src.shape[h_idx] * src.shape[w_idx]
+    dest_num = dest.shape[c_idx] * dest.shape[h_idx] * dest.shape[w_idx]
+    elements_num = src.shape[0] * src_num
+    for iter_n in range(elements_num):
+        coords = Index2Coordinate(iter_n, src.shape)
+        (n, c, i, j) = (coords[0], coords[c_idx], coords[h_idx], coords[w_idx])
+        ip_x = ip_y = 0
+        if (
+            j >= pad_left
+            and j < dx_width + pad_left
+            and (i >= pad_top)
+            and (i < dx_height + pad_top)
+        ):
+            ip_x = j - pad_left
+            ip_y = i - pad_top
+            src_index = n * src_num + c * dy_width * dy_height + i * dy_width + j
+            dest_index = (
+                n * dest_num + c * dx_width * dx_height + ip_y * dx_width + ip_x
+            )
+            array_dest[dest_index] += array_src[src_index]
+    numpy_dest = Array2Numpy(array_dest, dest.shape)
+    return numpy_dest
+
+
+def _test_ZeroPad2d(test_case, shape, padding, value, device):
+    np_input = np.random.random(shape)
+    of_input = flow.Tensor(
+        np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+    )
+    if isinstance(padding, int):
+        np_boundary = ((0, 0), (0, 0), (padding, padding), (padding, padding))
+    elif isinstance(padding, (tuple, int)) and len(padding) == 4:
+        np_boundary = (
+            (0, 0),
+            (0, 0),
+            (padding[2], padding[3]),
+            (padding[0], padding[1]),
+        )
+    else:
+        raise ValueError("padding must be in  or tuple!")
+    layer = flow.nn.ZeroPad2d(padding=padding)
+    of_out = layer(of_input)
+    np_out = np.pad(np_input, np_boundary, mode="constant", constant_values=value)
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+    of_out = of_out.sum()
+    of_out.backward()
+    np_out_grad = _np_zero_pad2d_grad(np_out, np_input, layer.padding)
+    test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_out_grad, 1e-05, 1e-05))
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestZeroPad2dModule(flow.unittest.TestCase):
+    def test_ConstantPad2d(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["shape"] = [(1, 2, 3, 4), (8, 3, 4, 4)]
+        arg_dict["padding"] = [2, (1, 1, 2, 2)]
+        arg_dict["value"] = [0.0]
+        arg_dict["device"] = ["cpu", "cuda"]
+        for arg in GenArgList(arg_dict):
+            _test_ZeroPad2d(test_case, *arg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test/tensor/automated_test_util.py b/python/oneflow/test/tensor/automated_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..682939b32a9e936c16040732ecb65b7c1c5290f0
--- /dev/null
+++ b/python/oneflow/test/tensor/automated_test_util.py
@@ -0,0 +1,29 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import sys
+
+test_util_parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
+oneflow_test_utils_dir_from_env = os.getenv("ONEFLOW_TEST_UTILS_DIR")
+if oneflow_test_utils_dir_from_env:
+    from pathlib import Path
+
+    oneflow_test_utils_dir_from_env = Path(oneflow_test_utils_dir_from_env)
+    test_util_parent_dir = str(oneflow_test_utils_dir_from_env.parent.absolute())
+sys.path.append(test_util_parent_dir)
+from test_utils.automated_test_util import *
diff --git a/python/oneflow/test/tensor/test_tensor.py b/python/oneflow/test/tensor/test_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5bc832f3e36b7bb4bcbe0af4950bcaccb5a7c10
--- /dev/null
+++ b/python/oneflow/test/tensor/test_tensor.py
@@ -0,0 +1,958 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import random
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from automated_test_util import *
+
+import oneflow as flow
+import oneflow.typing as oft
+import oneflow.unittest
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestTensor(flow.unittest.TestCase):
+    def test_numpy_and_default_dtype(test_case):
+        shape = (2, 3, 4, 5)
+        tensor = flow.Tensor(*shape)
+        flow.nn.init.ones_(tensor)
+        test_case.assertTrue(tensor.dtype == flow.float32)
+        test_case.assertTrue(
+            np.array_equal(tensor.numpy(), np.ones(shape, dtype=np.float32))
+        )
+
+    def test_tensor_property(test_case):
+        shape = (2, 3, 4, 5)
+        tensor = flow.Tensor(*shape)
+        tensor.determine()
+        test_case.assertEqual(tensor.storage_offset(), 0)
+        test_case.assertEqual(tensor.stride(), (60, 20, 5, 1))
+        test_case.assertEqual(tensor.is_cuda, False)
+        test_case.assertTrue(tensor.is_contiguous())
+
+    def test_copy_to_and_from_numpy(test_case):
+        np_arr = np.array([4, 6], dtype=np.float32)
+        tensor = flow.Tensor(np_arr, dtype=flow.float32)
+        test_case.assertTrue(np.array_equal(tensor.numpy(), np_arr))
+        test_case.assertEqual(np.float32, tensor.numpy().dtype)
+        np_arr = np.array([4, 6], dtype=np.int32)
+        tensor = flow.Tensor(np_arr, dtype=flow.int32)
+        test_case.assertTrue(np.array_equal(tensor.numpy(), np_arr))
+        test_case.assertEqual(np.int32, tensor.numpy().dtype)
+
+    def test_construct_from_numpy_or_list(test_case):
+        shape = (2, 3, 4, 5)
+        np_arr = np.random.rand(*shape).astype(np.float32)
+        tensor = flow.Tensor(np_arr)
+        test_case.assertTrue(np.array_equal(tensor.numpy(), np_arr))
+        np_int_arr = np.random.randint(-100, high=100, size=shape, dtype=np.int32)
+        tensor = flow.Tensor(np_int_arr, dtype=flow.int32)
+        test_case.assertEqual(tensor.dtype, flow.int32)
+        test_case.assertTrue(np_arr.flags["C_CONTIGUOUS"])
+        test_case.assertTrue(np.array_equal(tensor.numpy(), np_int_arr))
+        np_arr = np.random.random((1, 256, 256, 3)).astype(np.float32)
+        np_arr = np_arr.transpose(0, 3, 1, 2)
+        tensor = flow.Tensor(np_arr)
+        test_case.assertFalse(np_arr.flags["C_CONTIGUOUS"])
+        test_case.assertTrue(np.array_equal(tensor.numpy(), np_arr))
+
+    def test_construct_from_another_tensor(test_case):
+        shape = (2, 3, 4, 5)
+        np_arr = np.random.rand(*shape).astype(np.float32)
+        tensor = flow.Tensor(np_arr)
+        output = flow.Tensor(tensor)
+        test_case.assertEqual(output.dtype, flow.float32)
+        test_case.assertTrue(np.array_equal(output.numpy(), np_arr))
+
+    def test_tensor_init_methods(test_case):
+        shape = (2, 3, 4, 5)
+        x = flow.Tensor(*shape)
+        np_ones = np.ones(x.shape)
+        np_zeros = np.zeros(x.shape)
+        random_fill_val = np.random.uniform(-100.0, 100.0)
+        x.fill_(random_fill_val)
+        test_case.assertTrue(np.allclose(x.numpy(), random_fill_val * np_ones))
+        flow.nn.init.ones_(x)
+        test_case.assertTrue(np.array_equal(x.numpy(), np_ones))
+        flow.nn.init.zeros_(x)
+        test_case.assertTrue(np.array_equal(x.numpy(), np_zeros))
+        flow.nn.init.constant_(x, random_fill_val)
+        test_case.assertTrue(np.allclose(x.numpy(), random_fill_val * np_ones))
+        z = flow.Tensor(5, 4, 3, 2)
+        flow.nn.init.kaiming_normal_(z, a=0.1, mode="fan_out", nonlinearity="relu")
+        flow.nn.init.kaiming_uniform_(z)
+        flow.nn.init.xavier_normal_(z)
+        flow.nn.init.xavier_uniform_(z)
+        x = flow.Tensor(*shape, dtype=flow.int32)
+        np_ones = np.ones(x.shape, dtype=np.int32)
+        np_zeros = np.zeros(x.shape, dtype=np.int32)
+        random_fill_val = np.random.randint(-100, 100)
+        x.fill_(random_fill_val)
+        test_case.assertTrue(np.allclose(x.numpy(), random_fill_val * np_ones))
+        flow.nn.init.ones_(x)
+        test_case.assertTrue(np.array_equal(x.numpy(), np_ones))
+        flow.nn.init.zeros_(x)
+        test_case.assertTrue(np.array_equal(x.numpy(), np_zeros))
+        flow.nn.init.constant_(x, random_fill_val)
+        test_case.assertTrue(np.allclose(x.numpy(), random_fill_val * np_ones))
+        test_case.assertEqual(flow.nn.init.calculate_gain("conv2d"), 1)
+        test_case.assertEqual(flow.nn.init.calculate_gain("tanh"), 5.0 / 3)
+
+    @unittest.skipIf(True, "consistent_tensor doesn't work right now")
+    def test_creating_consistent_tensor(test_case):
+        shape = (2, 3)
+        x = flow.Tensor(*shape, placement=flow.placement("gpu", ["0:0"], None))
+        x.set_placement(flow.placement("cpu", ["0:0"], None))
+        x.set_is_consistent(True)
+        test_case.assertTrue(not x.is_cuda)
+        x.determine()
+
+    def test_tensor_device(test_case):
+        shape = (2, 3, 4, 5)
+        x = flow.Tensor(*shape)
+        test_case.assertTrue(not x.is_cuda)
+        x = flow.Tensor(*shape, device=flow.device("cuda"))
+        test_case.assertTrue(x.is_cuda)
+        x = flow.Tensor(*shape, device=flow.device("cpu"))
+        test_case.assertTrue(not x.is_cuda)
+
+    def test_tensor_autograd_related_methods(test_case):
+        shape = (2, 3, 4, 5)
+        x = flow.Tensor(*shape)
+        y = flow.Tensor(*shape, requires_grad=True)
+        x.fill_(1.0)
+        y.fill_(2.0)
+        z = x + y
+        test_case.assertFalse(x.requires_grad)
+        test_case.assertTrue(x.is_leaf)
+        test_case.assertTrue(y.requires_grad)
+        test_case.assertTrue(y.is_leaf)
+        test_case.assertTrue(z.requires_grad)
+        test_case.assertFalse(z.is_leaf)
+        with flow.no_grad():
+            m = x + y
+        test_case.assertTrue(m.is_leaf)
+        test_case.assertFalse(m.requires_grad)
+        m.requires_grad = True
+        v = flow.Tensor(*shape, requires_grad=True)
+        z.retain_grad()
+        w = v + z
+        grad = flow.Tensor(*shape)
+        grad.fill_(1.0)
+        w.backward(gradient=grad, retain_graph=True)
+        test_case.assertNotEqual(v.grad, None)
+        test_case.assertNotEqual(y.grad, None)
+        test_case.assertNotEqual(z.grad, None)
+        test_case.assertIsNone(x.grad)
+        w.backward(gradient=grad, retain_graph=True)
+
+    def test_tensor_register_hook(test_case):
+        shape = (2, 3)
+        x = flow.Tensor(*shape, requires_grad=True)
+        x.register_hook(lambda grad: grad * 2 + 1)
+        y = x.sum() + (x * 2).sum()
+        y.backward()
+        test_case.assertTrue(np.array_equal(x.grad.numpy(), np.ones(shape) * 7))
+        x = flow.Tensor(*shape, requires_grad=True)
+        new_grad = flow.Tensor([[1, 2, 3], [4, 5, 6]])
+        x.register_hook(lambda _: new_grad)
+        y = x.sum() + (x * 2).sum()
+        y.backward()
+        test_case.assertTrue(np.array_equal(x.grad.numpy(), new_grad.numpy()))
+        grad_nonlocal = None
+
+        def assign_nonlocal_variable_and_return_none(grad):
+            nonlocal grad_nonlocal
+            grad_nonlocal = grad
+
+        x = flow.Tensor(*shape, requires_grad=True)
+        new_grad = flow.Tensor([[1, 2, 3], [4, 5, 6]])
+        x.register_hook(assign_nonlocal_variable_and_return_none)
+        y = x.sum() + (x * 2).sum()
+        y.backward()
+        test_case.assertTrue(np.array_equal(grad_nonlocal.numpy(), np.ones(shape) * 3))
+
+    def test_user_defined_data(test_case):
+        list_data = [5, 5]
+        tuple_data = (5, 5)
+        numpy_data = np.array((5, 5))
+        x = flow.Tensor(list_data)
+        y = flow.Tensor(tuple_data)
+        z = flow.Tensor(numpy_data)
+        test_case.assertTrue(np.array_equal(x.numpy(), 5 * np.ones(x.shape)))
+        test_case.assertTrue(np.array_equal(y.numpy(), 5 * np.ones(y.shape)))
+        test_case.assertTrue(np.array_equal(z.numpy(), 5 * np.ones(z.shape)))
+
+    def test_mirrored_tensor_and_op(test_case):
+        x1 = flow.Tensor([[1.0, 2.0]])
+        test_case.assertEqual(x1.dtype, flow.float32)
+        test_case.assertEqual(x1.shape, flow.Size((1, 2)))
+        x2 = flow.Tensor([[1.0], [2.0]])
+        op = (
+            flow.builtin_op("matmul")
+            .Input("a")
+            .Input("b")
+            .Attr("transpose_a", False)
+            .Attr("transpose_b", False)
+            .Attr("alpha", float(1.0))
+            .Output("out")
+            .Build()
+        )
+        y = op(x1, x2)[0]
+        test_case.assertTrue(
+            np.array_equal(y.numpy(), np.array([[5.0]], dtype=np.float32))
+        )
+
+    def test_tensor_to_list(test_case):
+        list_data = [[1.0, 3.0], [5.0, 6.0]]
+        input = flow.Tensor(list_data)
+        test_case.assertEqual(list_data, input.tolist())
+
+    def test_tensor_nelement(test_case):
+        shape = (2, 3, 4)
+        input = flow.Tensor(*shape)
+        test_case.assertEqual(input.nelement(), 24)
+
+    def test_tensor_numel(test_case):
+        shape = (2, 3, 4, 5)
+        input = flow.Tensor(*shape)
+        test_case.assertEqual(input.numel(), 120)
+
+    def test_tensor_print(test_case):
+        shape = (2, 3, 4, 5)
+        input = flow.Tensor(*shape)
+        input_str = str(input)
+        test_case.assertTrue(input_str.startswith("tensor("))
+        test_case.assertTrue("device=" not in input_str)
+        gpu_input = flow.Tensor(*shape, device="cuda")
+        gpu_input_str = str(gpu_input)
+        test_case.assertTrue("device=" in gpu_input_str)
+        test_case.assertTrue("cuda:0" in gpu_input_str)
+        requires_grad_input = flow.Tensor(*shape, requires_grad=True)
+        requires_grad_input_str = str(requires_grad_input)
+        test_case.assertTrue("requires_grad=" in requires_grad_input_str)
+
+    def test_indexing(test_case):
+        class SliceExtracter:
+            def __getitem__(self, key):
+                return key
+
+        se = SliceExtracter()
+
+        def compare_getitem_with_numpy(tensor, slices):
+            np_arr = tensor.numpy()
+            test_case.assertTrue(np.array_equal(np_arr[slices], tensor[slices].numpy()))
+
+        def compare_setitem_with_numpy(tensor, slices, value):
+            np_arr = tensor.numpy()
+            if isinstance(value, flow.Tensor):
+                np_value = value.numpy()
+            else:
+                np_value = value
+            np_arr[slices] = np_value
+            tensor[slices] = value
+            test_case.assertTrue(np.array_equal(np_arr, tensor.numpy()))
+
+        x = flow.Tensor(5, 5)
+        v = flow.Tensor([[0, 1, 2, 3, 4]])
+        compare_getitem_with_numpy(x, se[-4:-1:2])
+        compare_getitem_with_numpy(x, se[-1:])
+        compare_setitem_with_numpy(x, se[-1:], v)
+        compare_setitem_with_numpy(x, se[2::2], 2)
+        x = flow.Tensor(2, 3, 4)
+        v = flow.Tensor(3)
+        compare_setitem_with_numpy(x, se[:, :, 2], v)
+        x = flow.Tensor(2, 3, 4)
+        compare_setitem_with_numpy(x, se[1, :, 2], v)
+
+    def test_div(test_case):
+        x = flow.Tensor(np.random.randn(1, 1))
+        y = flow.Tensor(np.random.randn(2, 3))
+        of_out = x / y
+        np_out = np.divide(x.numpy(), y.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = x / 3
+        np_out = np.divide(x.numpy(), 3)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = 3 / x
+        np_out = np.divide(3, x.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(1))
+        of_out = 3 / x
+        np_out = np.divide(3, x.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_mul(test_case):
+        x = flow.Tensor(np.random.randn(1, 1))
+        y = flow.Tensor(np.random.randn(2, 3))
+        of_out = x * y
+        np_out = np.multiply(x.numpy(), y.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = x * 3
+        np_out = np.multiply(x.numpy(), 3)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = 3 * x
+        np_out = np.multiply(3, x.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_add_tensor_method(test_case):
+        x = flow.Tensor(np.random.randn(1, 1))
+        y = flow.Tensor(np.random.randn(2, 3))
+        of_out = x + y
+        np_out = np.add(x.numpy(), y.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = x + 3
+        np_out = np.add(x.numpy(), 3)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = 3 + x
+        np_out = np.add(3, x.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_sub_tensor_method(test_case):
+        x = flow.Tensor(np.random.randn(1, 1))
+        y = flow.Tensor(np.random.randn(2, 3))
+        of_out = x - y
+        np_out = np.subtract(x.numpy(), y.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = x - 3
+        np_out = np.subtract(x.numpy(), 3)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        x = flow.Tensor(np.random.randn(2, 3))
+        of_out = 3 - x
+        np_out = np.subtract(3, x.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_sum(test_case):
+        input = flow.Tensor(np.random.randn(4, 5, 6), dtype=flow.float32)
+        of_out = input.sum(dim=(2, 1))
+        np_out = np.sum(input.numpy(), axis=(2, 1))
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    @autotest()
+    def test_tensor_tanh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.tanh()
+        return y
+
+    @unittest.skip("asin has bug")
+    @autotest()
+    def test_flow_tensor_asin_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.asin()
+        return y
+
+    @unittest.skip("arcsin has bug")
+    @autotest()
+    def test_flow_tensor_arcsin_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.arcsin()
+        return y
+
+    @autotest()
+    def test_flow_tensor_asinh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.asinh()
+        return y
+
+    @autotest()
+    def test_flow_tensor_arcsinh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.arcsinh()
+        return y
+
+    @autotest()
+    def test_flow_tensor_sinh_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.sinh()
+        return y
+
+    @autotest()
+    def test_flow_tensor_atan2_with_random_data(test_case):
+        device = random_device()
+        x1 = random_pytorch_tensor(ndim=1, dim0=1).to(device)
+        x2 = random_pytorch_tensor(ndim=1, dim0=1).to(device)
+        y = x1.atan2(x2)
+        return y
+
+    @unittest.skip("arccosh has bug")
+    @autotest()
+    def test_arccosh_tensor_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.arccosh()
+        return y
+
+    @unittest.skip("acosh has bug")
+    @autotest()
+    def test_acosh_tensor_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.acosh()
+        return y
+
+    def test_mean(test_case):
+        input = flow.Tensor(np.random.randn(2, 3), dtype=flow.float32)
+        of_out = input.mean(dim=0)
+        np_out = np.mean(input.numpy(), axis=0)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_neg(test_case):
+        input = flow.Tensor(np.random.randn(2, 3), dtype=flow.float32)
+        of_out = -input
+        np_out = -input.numpy()
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_negative(test_case):
+        input = flow.Tensor(np.random.randn(2, 3), dtype=flow.float32)
+        of_out = input.negative()
+        np_out = -input.numpy()
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_greater(test_case):
+        input1 = flow.Tensor(np.array([1, 1, 4]).astype(np.float32), dtype=flow.float32)
+        input2 = flow.Tensor(np.array([1, 2, 3]).astype(np.float32), dtype=flow.float32)
+        of_out = input1.gt(input2)
+        np_out = np.greater(input1.numpy(), input2.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_less(test_case):
+        input1 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        input2 = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        of_out = input1.lt(input2)
+        np_out = np.less(input1.numpy(), input2.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+
+    def test_tensor_slice(test_case):
+        x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+        input = flow.Tensor(x)
+        test_case.assertTrue(np.allclose(input[0].numpy(), x[0], 1e-05, 1e-05))
+        test_case.assertTrue(np.allclose(input[1].numpy(), x[1], 1e-05, 1e-05))
+        test_case.assertTrue(np.allclose(input[0, :].numpy(), x[0, :], 1e-05, 1e-05))
+        test_case.assertTrue(
+            np.allclose(input[0, :, 0:2].numpy(), x[0, :, 0:2], 1e-05, 1e-05)
+        )
+
+    def test_tensor_logical_slice_assign(test_case):
+        x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+        input = flow.Tensor(x)
+        input[:, 0] = 3.1415926
+        x[:, 0] = 3.1415926
+        test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05))
+        input[:, 1:2] = 1
+        x[:, 1:2] = 1
+        test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05))
+        input[:] = 1.234
+        x[:] = 1.234
+        test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05))
+        input[0] = 0
+        x[0] = 0
+        test_case.assertTrue(np.allclose(input.numpy(), x, 1e-05, 1e-05))
+
+    def test_zeros_(test_case):
+        shape = (2, 3)
+        x = flow.Tensor(np.random.randn(*shape), dtype=flow.float32)
+        x.zeros_()
+        test_case.assertTrue(np.array_equal(x.numpy(), np.zeros(shape)))
+
+    def test_construct_small_tensor(test_case):
+        shape = (2, 3, 4, 5)
+        np_arr = np.random.rand(*shape).astype(np.float32)
+        tensor = flow.tensor(np_arr)
+        test_case.assertTrue(np.array_equal(tensor.numpy(), np_arr))
+        test_case.assertEqual(tensor.dtype, flow.float32)
+        np_int_arr = np.random.randint(-100, high=100, size=shape, dtype=np.int32)
+        tensor = flow.tensor(np_int_arr, dtype=flow.int32)
+        test_case.assertEqual(tensor.dtype, flow.int32)
+        list_data = [[1, 2.0], [5, 3]]
+        tensor = flow.tensor(list_data)
+        test_case.assertEqual(tensor.dtype, flow.float32)
+        test_case.assertTrue(
+            np.allclose(tensor.numpy(), np.array(list_data), 0.0001, 0.0001)
+        )
+        tuple_data = ((1, 2, 5), (4, 3, 10))
+        tensor = flow.tensor(tuple_data)
+        test_case.assertEqual(tensor.dtype, flow.int64)
+        test_case.assertTrue(np.array_equal(tensor.numpy(), np.array(tuple_data)))
+        scalar = 5.5
+        tensor = flow.tensor(scalar)
+        test_case.assertEqual(tensor.dtype, flow.float32)
+        test_case.assertTrue(
+            np.allclose(tensor.numpy(), np.array(scalar), 0.0001, 0.0001)
+        )
+
+    def test_floor(test_case):
+        input = flow.Tensor(np.random.randn(4, 5, 6), dtype=flow.float32)
+        of_out = input.floor()
+        np_out = np.floor(input.numpy())
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+        )
+
+    def test_tensor_round(test_case):
+        shape = (2, 3)
+        np_input = np.random.randn(*shape)
+        of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+        of_out = flow.round(of_input)
+        np_out = np.round(np_input)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        of_out = of_out.sum()
+        of_out.backward()
+        test_case.assertTrue(
+            np.allclose(of_input.grad.numpy(), np.zeros(shape), 0.0001, 0.0001)
+        )
+
+    def _test_tensor_reshape(test_case):
+        x = np.array(
+            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+        ).astype(np.float32)
+        input = flow.Tensor(x)
+        of_shape = input.reshape(shape=[2, 2, 2, -1]).numpy().shape
+        np_shape = (2, 2, 2, 2)
+        test_case.assertTrue(np.array_equal(of_shape, np_shape))
+
+    @autotest()
+    def test_reshape_tensor_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor(ndim=4).to(device)
+        y = x.reshape(shape=(-1,))
+        return y
+
+    @autotest()
+    def test_tensor_squeeze_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.squeeze(random().to(int))
+        return y
+
+    @autotest()
+    def test_flow_unsqueeze_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor().to(device)
+        y = x.unsqueeze(random(1, 3).to(int))
+        return y
+
+    @autotest()
+    def test_permute_flow_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor(ndim=4).to(device)
+        y = x.permute(
+            random(0, 4).to(int),
+            random(0, 4).to(int),
+            random(0, 4).to(int),
+            random(0, 4).to(int),
+        )
+        return y
+
+    @autotest()
+    def test_transpose_tensor_with_random_data(test_case):
+        device = random_device()
+        x = random_pytorch_tensor(ndim=4).to(device)
+        y = x.transpose(dim0=random(1, 3).to(int), dim1=random(1, 3).to(int))
+        return y
+
+    def test_tensor_where(test_case):
+        x = flow.Tensor(
+            np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
+            dtype=flow.float32,
+        )
+        y = flow.Tensor(np.ones(shape=(3, 2)), dtype=flow.float32)
+        condition = flow.Tensor(np.array([[0, 1], [1, 0], [1, 0]]), dtype=flow.int32)
+        of_out = condition.where(x, y)
+        np_out = np.array([[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]])
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
+
+    def test_tensor_equal(test_case):
+        arr1 = np.random.randint(1, 10, size=(2, 3, 4, 5))
+        arr2 = np.random.randint(1, 10, size=(2, 3, 4, 5))
+        input = flow.Tensor(arr1, dtype=flow.float32)
+        other = flow.Tensor(arr2, dtype=flow.float32)
+        of_out = input.eq(other)
+        np_out = np.equal(arr1, arr2)
+        test_case.assertTrue(np.array_equal(of_out.numpy(), np_out))
+
+    def _test_tensor_atan(test_case, shape, device):
+        np_input = np.random.randn(*shape)
+        of_input = flow.Tensor(
+            np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+        )
+        of_out = of_input.atan()
+        np_out = np.arctan(np_input)
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+        )
+        of_out = of_out.sum()
+        of_out.backward()
+        np_out_grad = 1 / (1 + np_input ** 2)
+        test_case.assertTrue(
+            np.allclose(
+                of_input.grad.numpy(), np_out_grad, 1e-05, 1e-05, equal_nan=True
+            )
+        )
+
+    def _test_tensor_arctan(test_case, shape, device):
+        np_input = np.random.randn(*shape)
+        of_input = flow.Tensor(
+            np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True
+        )
+        of_out = of_input.arctan()
+        np_out = np.arctan(np_input)
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+        )
+        of_out = of_out.sum()
+        of_out.backward()
+        np_out_grad = 1 / (1 + np_input ** 2)
+        test_case.assertTrue(
+            np.allclose(
+                of_input.grad.numpy(), np_out_grad, 1e-05, 1e-05, equal_nan=True
+            )
+        )
+
+    def test_tensor_detach(test_case):
+        shape = (2, 3, 4, 5)
+        x = flow.Tensor(np.random.randn(*shape), dtype=flow.float32, requires_grad=True)
+        test_case.assertTrue(np.allclose(x.detach().numpy(), x.numpy(), 0.0001, 0.0001))
+        test_case.assertEqual(x.detach().requires_grad, False)
+        y = x * 2
+        z = y.detach()
+        test_case.assertEqual(z.is_leaf, True)
+        test_case.assertEqual(z.grad_fn, None)
+
+    def test_tensor_clamp_(test_case):
+        input = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        of_out = input.clamp(0.1, 0.5)
+        np_out = np.clip(input.numpy(), 0.1, 0.5)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+    def test_tensor_clip_(test_case):
+        input = flow.Tensor(np.random.randn(2, 6, 5, 3), dtype=flow.float32)
+        of_out = input.clip(0.1, 0.5)
+        np_out = np.clip(input.numpy(), 0.1, 0.5)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+    def _test_cast_tensor_function(test_case):
+        shape = (2, 3, 4, 5)
+        np_arr = np.random.randn(*shape).astype(np.float32)
+        input = flow.Tensor(np_arr, dtype=flow.float32)
+        output = input.cast(flow.int8)
+        np_out = np_arr.astype(np.int8)
+        test_case.assertTrue(np.array_equal(output.numpy(), np_out))
+
+    def _test_sin_tensor_function(test_case, shape, device):
+        input = flow.Tensor(np.random.randn(2, 3, 4, 5))
+        of_out = input.sin()
+        np_out = np.sin(input.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+    def test_cos_tensor_function(test_case):
+        arr = np.random.randn(2, 3, 4, 5)
+        input = flow.Tensor(arr, dtype=flow.float32)
+        np_out = np.cos(arr)
+        of_out = input.cos()
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+    def test_std_tensor_function(test_case):
+        np_arr = np.random.randn(9, 8, 7, 6)
+        input = flow.Tensor(np_arr)
+        of_out = input.std(dim=1, keepdim=False)
+        np_out = np.std(np_arr, axis=1)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+    def test_sqrt_tensor_function(test_case):
+        input_arr = np.random.rand(1, 6, 3, 8)
+        np_out = np.sqrt(input_arr)
+        x = flow.Tensor(input_arr)
+        of_out = x.sqrt()
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+        )
+
+    def test_rsqrt_tensor_function(test_case):
+        np_arr = np.random.rand(3, 2, 5, 7)
+        np_out = 1 / np.sqrt(np_arr)
+        x = flow.Tensor(np_arr)
+        of_out = flow.rsqrt(input=x)
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+        )
+
+    def test_square_tensor_function(test_case):
+        np_arr = np.random.randn(2, 7, 7, 3)
+        np_out = np.square(np_arr)
+        x = flow.Tensor(np_arr)
+        of_out = x.square()
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+        )
+
+    def test_tensor_addmm_(test_case):
+        input = flow.Tensor(np.random.randn(2, 6), dtype=flow.float32)
+        mat1 = flow.Tensor(np.random.randn(2, 3), dtype=flow.float32)
+        mat2 = flow.Tensor(np.random.randn(3, 6), dtype=flow.float32)
+        of_out = input.addmm(mat1, mat2, alpha=1, beta=2)
+        np_out = np.add(2 * input.numpy(), 1 * np.matmul(mat1.numpy(), mat2.numpy()))
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+
+    def test_norm_tensor_function(test_case):
+        input = flow.Tensor(
+            np.array([[-4.0, -3.0, -2.0], [-1.0, 0.0, 1.0], [2.0, 3.0, 4.0]]),
+            dtype=flow.float32,
+        )
+        of_out_1 = input.norm("fro")
+        np_out_1 = np.linalg.norm(input.numpy(), "fro")
+        of_out_2 = input.norm(2, dim=1)
+        np_out_2 = np.linalg.norm(input.numpy(), ord=2, axis=1)
+        of_out_3 = input.norm(float("inf"), dim=0, keepdim=True)
+        np_out_3 = np.linalg.norm(
+            input.numpy(), ord=float("inf"), axis=0, keepdims=True
+        )
+        test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out_1, 1e-05, 1e-05))
+        test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out_2, 1e-05, 1e-05))
+        test_case.assertTrue(np.allclose(of_out_3.numpy(), np_out_3, 1e-05, 1e-05))
+
+    def test_pow_tensor_function(test_case):
+        input = flow.Tensor(np.array([1, 2, 3, 4, 5, 6]), dtype=flow.float32)
+        of_out = input.pow(2.1)
+        np_out = np.power(input.numpy(), 2.1)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+        of_out_magic = input ** 2.1
+        test_case.assertTrue(np.allclose(of_out_magic.numpy(), np_out, 1e-05, 1e-05))
+
+    def test_tensor_atanh(test_case):
+        np_input = np.random.random((2, 3)) - 0.5
+        of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+        of_out = of_input.atanh()
+        np_out = np.arctanh(np_input)
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+        )
+        of_out = of_out.sum()
+        of_out.backward()
+        np_out_grad = 1.0 / (1.0 - np.square(np_input))
+        test_case.assertTrue(
+            np.allclose(
+                of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True
+            )
+        )
+
+    def test_tensor_arctanh(test_case):
+        np_input = np.random.random((2, 3)) - 0.5
+        of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+        of_out = of_input.arctanh()
+        np_out = np.arctanh(np_input)
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+        )
+        of_out = of_out.sum()
+        of_out.backward()
+        np_out_grad = 1.0 / (1.0 - np.square(np_input))
+        test_case.assertTrue(
+            np.allclose(
+                of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True
+            )
+        )
+
+    def test_tensor_tan(test_case):
+        np_input = np.random.random((2, 3)) - 0.5
+        of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+        of_out = of_input.tan()
+        np_out = np.tan(np_input)
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001, equal_nan=True)
+        )
+        of_out = of_out.sum()
+        of_out.backward()
+        np_out_grad = 1 + np.square(np_out)
+        test_case.assertTrue(
+            np.allclose(
+                of_input.grad.numpy(), np_out_grad, 0.0001, 0.0001, equal_nan=True
+            )
+        )
+
+    def test_tensor_acos(test_case):
+        input = flow.Tensor(np.random.rand(8, 11, 9, 7) - 0.5, requires_grad=True)
+        of_out = input.acos()
+        np_out = np.arccos(input.numpy())
+        test_case.assertTrue(
+            np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05, equal_nan=True)
+        )
+        of_out = of_out.sum()
+        of_out.backward()
+        np_grad = -1.0 / np.sqrt(1 - np.square(input.numpy()))
+        test_case.assertTrue(
+            np.allclose(input.grad.numpy(), np_grad, 0.0001, 0.0001, equal_nan=True)
+        )
+
+    @unittest.skipIf(
+        not flow.unittest.env.eager_execution_enabled(),
+        "numpy doesn't work in lazy mode",
+    )
+    def test_tensor_fmod(test_case):
+        x = flow.Tensor(np.random.uniform(-100, 100, (5, 5)), requires_grad=True)
+        y = random.uniform(-10, 10)
+        of_out = x.fmod(y)
+        np_out = np.sign(x.numpy()) * np.abs(np.fmod(x.numpy(), y))
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        of_out = of_out.sum()
+        of_out.backward()
+        test_case.assertTrue(
+            np.allclose(x.grad.numpy(), np.ones((5, 5)), 0.0001, 0.0001)
+        )
+
+    @unittest.skipIf(
+        not flow.unittest.env.eager_execution_enabled(),
+        "numpy doesn't work in lazy mode",
+    )
+    def test_magic_fmod(test_case):
+        x = flow.Tensor(np.random.uniform(-100, 100, (5, 5)), requires_grad=True)
+        y = random.uniform(-10, 10)
+        of_out = x % y
+        np_out = np.sign(x.numpy()) * np.abs(np.fmod(x.numpy(), y))
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        of_out = of_out.sum()
+        of_out.backward()
+        test_case.assertTrue(
+            np.allclose(x.grad.numpy(), np.ones((5, 5)), 0.0001, 0.0001)
+        )
+
+    @unittest.skipIf(
+        not flow.unittest.env.eager_execution_enabled(),
+        "numpy doesn't work in lazy mode",
+    )
+    def test_tensor_ceil(test_case):
+        x = flow.Tensor(np.random.randn(2, 3), requires_grad=True)
+        of_out = x.ceil()
+        np_out = np.ceil(x.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        of_out = of_out.sum()
+        of_out.backward()
+        test_case.assertTrue(
+            np.allclose(x.grad.numpy(), np.zeros((2, 3)), 0.0001, 0.0001)
+        )
+
+    def test_tensor_expm1(test_case):
+        x = flow.Tensor(np.random.randn(2, 3), requires_grad=True)
+        of_out = x.expm1()
+        np_out = np.expm1(x.numpy())
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 0.0001, 0.0001))
+        of_out = of_out.sum()
+        of_out.backward()
+        test_case.assertTrue(
+            np.allclose(x.grad.numpy(), np.exp(x.numpy()), 0.0001, 0.0001)
+        )
+
+    def test_tensor_mish(test_case):
+        def np_mish(x):
+            f = 1 + np.exp(x)
+            y = x * ((f * f - 1) / (f * f + 1))
+            y_grad = (f * f - 1) / (f * f + 1) + x * (4 * f * (f - 1)) / (
+                (f * f + 1) * (f * f + 1)
+            )
+            return [y, y_grad]
+
+        np_input = np.random.randn(2, 4, 5, 6)
+        of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+        of_out = of_input.mish()
+        (np_out, np_grad) = np_mish(np_input)
+        test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+        of_out = of_out.sum()
+        of_out.backward()
+        test_case.assertTrue(np.allclose(of_input.grad.numpy(), np_grad, 1e-05, 1e-05))
+
+    def test_tensor_triu(test_case):
+        def np_triu(x, diagonal):
+            y = np.triu(x, diagonal)
+            y_grad = np.triu(np.ones_like(x), diagonal)
+            return [y, y_grad]
+
+        diagonal_list = [2, -1]
+        for diagonal in diagonal_list:
+            np_input = np.random.randn(2, 4, 6)
+            of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+            of_out = of_input.triu(diagonal)
+            (np_out, np_grad) = np_triu(np_input, diagonal)
+            test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
+            of_out = of_out.sum()
+            of_out.backward()
+            test_case.assertTrue(
+                np.allclose(of_input.grad.numpy(), np_grad, 1e-05, 1e-05)
+            )
+
+    def test_tensor_grad_assignment(test_case):
+        np_input = np.random.randn(2, 4, 5, 6)
+        of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+        of_output = 2 * of_input
+        of_output = of_output.sum()
+        of_output.backward()
+        new_grad = flow.Tensor(
+            np.full(np_input.shape, np.random.randn(1)), dtype=flow.float32
+        )
+        of_input.grad = new_grad
+        test_case.assertTrue(
+            np.allclose(of_input.grad.detach().numpy(), new_grad.numpy(), 1e-05, 1e-05)
+        )
+        of_input.grad = None
+        test_case.assertTrue(of_input.grad is None)
+
+    def test_tensor_grad_assignment_sum(test_case):
+        np_input = np.random.randn(1, 5, 7, 3)
+        of_input = flow.Tensor(np_input, dtype=flow.float32, requires_grad=True)
+        of_output = of_input.sum()
+        of_output.backward()
+        rand_init = np.random.randn(1)
+        rand_scale = np.random.randn(1)
+        new_grad = flow.Tensor(np.full(np_input.shape, rand_init), dtype=flow.float32)
+        of_input.grad = new_grad
+        of_output = flow.Tensor(rand_scale, dtype=flow.float32) * of_input
+        of_output = of_output.sum()
+        of_output.backward()
+        test_case.assertTrue(
+            np.allclose(
+                of_input.grad.detach().numpy(),
+                np.full(np_input.shape, rand_init + rand_scale),
+                1e-05,
+                1e-05,
+            )
+        )
+        of_input.grad = of_input.grad * 2
+        test_case.assertTrue(
+            np.allclose(
+                of_input.grad.detach().numpy(),
+                2 * np.full(np_input.shape, rand_init + rand_scale),
+                1e-05,
+                1e-05,
+            )
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/oneflow/test_utils/__init__.py b/python/oneflow/test_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/test_utils/automated_test_util/__init__.py b/python/oneflow/test_utils/automated_test_util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf12a3daccf3e22b5adab11c6617dec088ea333
--- /dev/null
+++ b/python/oneflow/test_utils/automated_test_util/__init__.py
@@ -0,0 +1,17 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .generators import *
+from .torch_flow_dual_object import *
diff --git a/python/oneflow/test_utils/automated_test_util/generators.py b/python/oneflow/test_utils/automated_test_util/generators.py
new file mode 100644
index 0000000000000000000000000000000000000000..29732284eabc50d4e1fe848d1234d5445b4c08d1
--- /dev/null
+++ b/python/oneflow/test_utils/automated_test_util/generators.py
@@ -0,0 +1,648 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import inspect
+import os
+import random as random_util
+import typing
+from collections import namedtuple
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+import oneflow as flow
+
+py_tuple = tuple
+TEST_MODULE = 0
+TEST_FLOW = 1
+TEST_TENSOR = 2
+rng = np.random.default_rng()
+annotation2default_generator = {}
+annotation2torch_to_flow_converter = {}
+
+
+def data_generator(annotation):
+    def register_data_generator(cls):
+        annotation2default_generator[annotation] = lambda: cls()
+        return cls
+
+    return register_data_generator
+
+
+def torch_to_flow_converter(annotation):
+    def register_flow_to_flow_converter(func):
+        annotation2torch_to_flow_converter[annotation] = func
+        return func
+
+    return register_flow_to_flow_converter
+
+
+@torch_to_flow_converter(torch.Tensor)
+def tensor_converter(torch_tensor):
+    return flow.tensor(torch_tensor.cpu().numpy())
+
+
+def convert_torch_object_to_flow(x):
+    for (annotation, converter) in annotation2torch_to_flow_converter.items():
+        if isinstance(x, annotation):
+            return converter(x)
+    return x
+
+
+def pack(x):
+    if isinstance(x, generator):
+        return x
+    return constant(x)
+
+
+class Nothing:
+    pass
+
+
+class generator:
+    def __init__(self, children):
+        self.children = children
+        self._value = None
+
+    def _init(self):
+        self._value = None
+        for x in self.children:
+            x._init()
+
+    def eval(self):
+        self._init()
+        return self.value()
+
+    def _calc_value(self):
+        raise NotImplementedError()
+
+    def value(self):
+        if self._value is None:
+            self._value = self._calc_value()
+        return self._value
+
+    def size(self):
+        return 1
+
+    def __or__(self, other):
+        other = pack(other)
+        return oneof(
+            self, other, possibility=self.size() / (self.size() + other.size())
+        )
+
+    def __ror__(self, other):
+        return self | other
+
+    def __add__(self, other):
+        return add(self, other)
+
+    def __radd__(self, other):
+        return self + other
+
+    def __sub__(self, other):
+        return self + neg(other)
+
+    def __rsub__(self, other):
+        return neg(self - other)
+
+    def __mul__(self, other):
+        return mul(self, other)
+
+    def __rmul__(self, other):
+        return self * other
+
+    def to(self, annotation):
+        self._to(annotation)
+        for x in self.children:
+            x.to(annotation)
+        return self
+
+    def _to(self, annotation):
+        pass
+
+
+class add(generator):
+    def __init__(self, a, b):
+        self.a = pack(a)
+        self.b = pack(b)
+        super().__init__([self.a, self.b])
+
+    def _calc_value(self):
+        return self.a.value() + self.b.value()
+
+
+class mul(generator):
+    def __init__(self, a, b):
+        self.a = pack(a)
+        self.b = pack(b)
+        super(mul, self).__init__([self.a, self.b])
+
+    def _calc_value(self):
+        return self.a.value() * self.b.value()
+
+
+class neg(generator):
+    def __init__(self, a):
+        self.a = pack(a)
+        super().__init__([self.a])
+
+    def _calc_value(self):
+        return -self.a.value()
+
+
+class oneof(generator):
+    def __init__(self, *args, possibility=None):
+        self.args = list(map(pack, args))
+        super().__init__(self.args)
+        if isinstance(possibility, float):
+            assert len(args) == 2
+            possibility = [possibility, 1 - possibility]
+        if possibility is None:
+            possibility = [1 / len(args)] * len(args)
+        self.possibility = pack(possibility)
+
+    def _calc_value(self):
+        rand = rng.random()
+        sum = 0
+        for (i, possibility) in enumerate(self.possibility.value()):
+            sum += possibility
+            if sum > rand:
+                return self.args[i].value()
+        raise RuntimeError()
+
+    def size(self):
+        return sum([x.size() for x in self.args])
+
+
+class tuple(generator):
+    def __init__(self, *args):
+        self.args = list(map(pack, args))
+        super().__init__(self.args)
+
+    def _calc_value(self):
+        return py_tuple([x.value() for x in self.args])
+
+
+class constant(generator):
+    def __init__(self, x):
+        super().__init__([])
+        self.x = x
+
+    def _calc_value(self):
+        return self.x
+
+
+class nothing(generator):
+    def __init__(self):
+        super().__init__([])
+
+    def _calc_value(self):
+        return Nothing()
+
+
+class random(generator):
+    def __init__(self, low=1, high=6):
+        self.low = pack(low)
+        self.high = pack(high)
+        super().__init__([self.low, self.high])
+        self.annotation = None
+
+    def _to(self, annotation):
+        if self.annotation is not None:
+            return
+        if hasattr(annotation, "__origin__"):
+            annotation = eval(repr(annotation))
+        self.annotation = annotation
+
+    def _generate(self, annotation):
+        if hasattr(annotation, "__origin__"):
+            if annotation.__origin__ is Union:
+                x = random_util.choice(annotation.__args__)
+                return self._generate(x)
+            if annotation.__origin__ is Tuple or annotation.__origin__ is py_tuple:
+                return [self._generate(x) for x in annotation.__args__]
+            else:
+                raise NotImplementedError(
+                    f"Not implemented annotation {annotation} in random, type(annotation.__origin__) is {type(annotation.__origin__)}"
+                )
+        (low, high) = (self.low.value(), self.high.value())
+        if annotation == int:
+            val = int(rng.integers(low, high))
+        elif annotation == float:
+            val = float(rng.random() * (high - low) + low)
+        elif annotation == bool:
+            val = random_util.choice([True, False])
+        else:
+            raise NotImplementedError(
+                f"Not implemented annotation {annotation} in random"
+            )
+        return val
+
+    def _calc_value(self):
+        return self._generate(self.annotation)
+
+
+def random_or_nothing(low, high):
+    return oneof(random(low, high), nothing(), possibility=2 / 3)
+
+
+@data_generator(torch.Tensor)
+class random_tensor(generator):
+    def __init__(
+        self,
+        ndim=None,
+        dim0=1,
+        dim1=None,
+        dim2=None,
+        dim3=None,
+        dim4=None,
+        low=0,
+        high=1,
+        dtype=float,
+    ):
+        if ndim is None:
+            ndim = random(1, 6)
+        if dim0 is None:
+            dim0 = random(1, 8)
+        if dim1 is None:
+            dim1 = random(1, 8)
+        if dim2 is None:
+            dim2 = random(1, 8)
+        if dim3 is None:
+            dim3 = random(1, 8)
+        if dim4 is None:
+            dim4 = random(1, 8)
+        self.ndim = pack(ndim).to(int)
+        self.dim0 = pack(dim0).to(int)
+        self.dim1 = pack(dim1).to(int)
+        self.dim2 = pack(dim2).to(int)
+        self.dim3 = pack(dim3).to(int)
+        self.dim4 = pack(dim4).to(int)
+        self.low = pack(low).to(float)
+        self.high = pack(high).to(float)
+        self.dtype = pack(dtype)
+        super().__init__(
+            [
+                self.ndim,
+                self.dim0,
+                self.dim1,
+                self.dim2,
+                self.dim3,
+                self.dim4,
+                self.low,
+                self.high,
+                self.dtype,
+            ]
+        )
+
+    def _calc_value(self):
+        ndim = self.ndim.value()
+        dim0 = self.dim0.value()
+        dim1 = self.dim1.value()
+        dim2 = self.dim2.value()
+        dim3 = self.dim3.value()
+        dim4 = self.dim4.value()
+        low = self.low.value()
+        high = self.high.value()
+        dtype = self.dtype.value()
+        shape = rng.integers(low=1, high=8, size=ndim)
+        if dim0 is not None:
+            shape[0] = dim0
+        if ndim >= 2:
+            shape[1] = dim1
+        if ndim >= 3:
+            shape[2] = dim2
+        if ndim >= 4:
+            shape[3] = dim3
+        if ndim == 5:
+            shape[4] = dim4
+        if dtype == float:
+            np_arr = rng.random(shape)
+            return torch.Tensor(np_arr)
+        elif dtype == int:
+            np_arr = rng.integers(low=low, high=high, size=shape)
+            return torch.tensor(np_arr, dtype=torch.int64)
+        else:
+            raise NotImplementedError(f"Not implemented dtype {dtype} in random")
+
+
+@data_generator(bool)
+def random_bool():
+    return random().to(bool)
+
+
+class random_device(generator):
+    def __init__(self):
+        super().__init__([])
+
+    def _calc_value(self):
+        return random_util.choice(["cuda", "cpu"])
+
+
+def test_against_pytorch(
+    test_case,
+    callable_name,
+    extra_annotations: Optional[Dict[str, Any]] = None,
+    extra_generators: Optional[Dict[str, Any]] = None,
+    extra_defaults: Optional[Dict[str, Any]] = None,
+    device: str = "cuda",
+    training: bool = True,
+    backward: bool = True,
+    rtol=0.0001,
+    atol=1e-05,
+    n=20,
+    pytorch_callable_name=None,
+    api_flag: int = TEST_MODULE,
+):
+    assert device in ["cuda", "cpu"]
+    if not training:
+        assert not backward
+    if extra_annotations is None:
+        extra_annotations = {}
+    if extra_generators is None:
+        extra_generators = {}
+    if extra_defaults is None:
+        extra_defaults = {}
+    if pytorch_callable_name is None:
+        pytorch_callable_name = callable_name
+    verbose = os.getenv("ONEFLOW_TEST_VERBOSE") is not None
+
+    def has_full_args_spec(callable):
+        try:
+            inspect.getfullargspec(callable)
+            return True
+        except Exception:
+            return False
+
+    if api_flag == TEST_TENSOR:
+        pytorch_tensor = torch.Tensor(1)
+        pytorch_call = eval(f"pytorch_tensor.{pytorch_callable_name}")
+    else:
+        pytorch_call = eval(f"torch.{pytorch_callable_name}")
+    Spec = namedtuple(
+        "spec",
+        "args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations",
+    )
+    if has_full_args_spec(pytorch_call):
+        tmp_spec = inspect.getfullargspec(pytorch_call)
+        new_defaults = tmp_spec.defaults
+        if new_defaults is None:
+            new_defaults = []
+        new_kwonlydefaults = tmp_spec.kwonlydefaults
+        if new_kwonlydefaults is None:
+            new_kwonlydefaults = []
+        spec = Spec(
+            tmp_spec.args,
+            tmp_spec.varargs,
+            tmp_spec.varkw,
+            new_defaults,
+            tmp_spec.kwonlyargs,
+            new_kwonlydefaults,
+            tmp_spec.annotations,
+        )
+    else:
+        args = list(extra_annotations.keys()) + list(extra_defaults.keys())
+        spec = Spec(args, None, None, [], [], {}, {})
+    annotations = spec.annotations
+    annotations.update(extra_annotations)
+    if "return" in annotations:
+        del annotations["return"]
+    args = (set(spec.args) | set(spec.kwonlyargs)) - {"self"}
+    assert args == set(
+        annotations.keys()
+    ), f"args = {args}, annotations = {annotations.keys()}"
+    if "input" not in annotations:
+        annotations.update({"input": torch.Tensor})
+
+    def has_default(name):
+        if name in spec.args:
+            return len(spec.args) - spec.args.index(name) <= len(spec.defaults)
+        else:
+            assert name in spec.kwonlyargs
+            return len(spec.kwonlyargs) - spec.kwonlyargs.index(name) <= len(
+                spec.kwonlydefaults
+            )
+
+    def get_generator(name):
+        annotation = annotations[name]
+        if name in extra_generators:
+            generator = extra_generators[name]
+        else:
+            generator = annotation2default_generator[annotation]()
+        generator = generator.to(annotation)
+        return generator
+
+    while n > 0:
+        flow_attr_dict = {}
+        torch_attr_dict = {}
+        generator_tuple = tuple(
+            *[get_generator(name) for name in args] + [get_generator("input")]
+        )
+        values = generator_tuple.eval()
+        for (i, name) in enumerate(args):
+            torch_data = values[i]
+            if isinstance(torch_data, Nothing):
+                continue
+            flow_data = convert_torch_object_to_flow(torch_data)
+            if isinstance(torch_data, torch.Tensor):
+                torch_data = torch_data.to(device)
+            if isinstance(flow_data, flow.Tensor):
+                flow_data = flow_data.to(device)
+            flow_attr_dict[name] = flow_data
+            torch_attr_dict[name] = torch_data
+        if verbose:
+            print(f"attr = {torch_attr_dict}, device = {device}")
+        torch_input_original = values[-1]
+        flow_input_original = convert_torch_object_to_flow(torch_input_original)
+        flow_input_original.requires_grad_(backward)
+        torch_input_original.requires_grad_(backward)
+        (flow_input, torch_input) = (
+            flow_input_original.to(device),
+            torch_input_original.to(device),
+        )
+        try:
+            if api_flag == TEST_MODULE:
+                torch_call = pytorch_call(**torch_attr_dict)
+                torch_call = torch_call.to(device)
+                torch_call.train(training)
+                torch_res = torch_call(torch_input)
+                state_dict = torch_call.state_dict()
+                state_dict = {
+                    k: v.detach().cpu().numpy() for (k, v) in state_dict.items()
+                }
+            elif api_flag == TEST_FLOW:
+                torch_xxx_func = eval(f"torch.{pytorch_callable_name}")
+                torch_res = torch_xxx_func(torch_input, **torch_attr_dict)
+            else:
+                torch_tensor_xxx_func = eval(f"torch_input.{pytorch_callable_name}")
+                torch_res = torch_tensor_xxx_func(**torch_attr_dict)
+            loss = torch_res.sum()
+            loss.backward()
+            if api_flag == TEST_MODULE:
+                state_dict = torch_call.state_dict()
+                state_dict = {
+                    k: v.detach().cpu().numpy() for (k, v) in state_dict.items()
+                }
+        except Exception as e:
+            if verbose:
+                print(f"PyTorch error: {e}")
+            continue
+        if api_flag == TEST_MODULE:
+            flow_call_class = eval(f"flow.{callable_name}")
+            flow_call = flow_call_class(**flow_attr_dict)
+            flow_call = flow_call.to(device)
+            flow_call.train(training)
+            flow_call.load_state_dict(state_dict)
+            flow_res = flow_call(flow_input)
+        elif api_flag == TEST_FLOW:
+            flow_xxx_func = eval(f"flow.{callable_name}")
+            flow_res = flow_xxx_func(flow_input, **flow_attr_dict)
+        else:
+            flow_tensor_xxx_func = eval(f"flow_input.{callable_name}")
+            flow_res = flow_tensor_xxx_func(**flow_attr_dict)
+        loss = flow_res.sum()
+        loss.backward()
+
+        def allclose_or_fail(flow_tensor, torch_tensor):
+            is_allclose = np.allclose(
+                flow_tensor.numpy(),
+                torch_tensor.detach().cpu().numpy(),
+                rtol=rtol,
+                atol=atol,
+            )
+            test_case.assertTrue(
+                is_allclose,
+                f"flow_tensor = {flow_tensor},\ntorch_tensor = {torch_tensor},\nattr_dict = {torch_attr_dict},\nflow_input_tensor = {flow_input_original}",
+            )
+
+        allclose_or_fail(flow_res, torch_res)
+        allclose_or_fail(flow_input_original.grad, torch_input_original.grad)
+        if api_flag == TEST_MODULE:
+            flow_parameters = dict(flow_call.named_parameters())
+            for (name, torch_param) in torch_call.named_parameters():
+                flow_param = flow_parameters[name]
+                allclose_or_fail(flow_param.grad, torch_param.grad)
+        if verbose:
+            print("test passed")
+        n -= 1
+
+
+def test_module_against_pytorch(
+    test_case,
+    callable_name,
+    extra_annotations: Optional[Dict[str, Any]] = None,
+    extra_generators: Optional[Dict[str, Any]] = None,
+    extra_defaults: Optional[Dict[str, Any]] = None,
+    device: str = "cuda",
+    training: bool = True,
+    backward: bool = True,
+    rtol=0.0001,
+    atol=1e-05,
+    n=20,
+    pytorch_callable_name=None,
+):
+    return test_against_pytorch(
+        test_case=test_case,
+        callable_name=callable_name,
+        extra_annotations=extra_annotations,
+        extra_generators=extra_generators,
+        extra_defaults=extra_defaults,
+        device=device,
+        training=training,
+        backward=backward,
+        rtol=rtol,
+        atol=atol,
+        n=n,
+        pytorch_callable_name=pytorch_callable_name,
+        api_flag=TEST_MODULE,
+    )
+
+
+def test_flow_against_pytorch(
+    test_case,
+    callable_name,
+    extra_annotations: Optional[Dict[str, Any]] = None,
+    extra_generators: Optional[Dict[str, Any]] = None,
+    extra_defaults: Optional[Dict[str, Any]] = None,
+    device: str = "cuda",
+    training: bool = True,
+    backward: bool = True,
+    rtol=0.0001,
+    atol=1e-05,
+    n=20,
+    pytorch_callable_name=None,
+):
+    return test_against_pytorch(
+        test_case=test_case,
+        callable_name=callable_name,
+        extra_annotations=extra_annotations,
+        extra_generators=extra_generators,
+        extra_defaults=extra_defaults,
+        device=device,
+        training=training,
+        backward=backward,
+        rtol=rtol,
+        atol=atol,
+        n=n,
+        pytorch_callable_name=pytorch_callable_name,
+        api_flag=TEST_FLOW,
+    )
+
+
+def test_tensor_against_pytorch(
+    test_case,
+    callable_name,
+    extra_annotations: Optional[Dict[str, Any]] = None,
+    extra_generators: Optional[Dict[str, Any]] = None,
+    extra_defaults: Optional[Dict[str, Any]] = None,
+    device: str = "cuda",
+    training: bool = True,
+    backward: bool = True,
+    rtol=0.0001,
+    atol=1e-05,
+    n=20,
+    pytorch_callable_name=None,
+):
+    return test_against_pytorch(
+        test_case=test_case,
+        callable_name=callable_name,
+        extra_annotations=extra_annotations,
+        extra_generators=extra_generators,
+        extra_defaults=extra_defaults,
+        device=device,
+        training=training,
+        backward=backward,
+        rtol=rtol,
+        atol=atol,
+        n=n,
+        pytorch_callable_name=pytorch_callable_name,
+        api_flag=TEST_TENSOR,
+    )
+
+
+__all__ = [
+    "random_tensor",
+    "random_bool",
+    "random_device",
+    "random",
+    "random_or_nothing",
+    "oneof",
+    "constant",
+    "nothing",
+    "test_module_against_pytorch",
+    "test_flow_against_pytorch",
+    "test_tensor_against_pytorch",
+]
diff --git a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py
new file mode 100644
index 0000000000000000000000000000000000000000..232ea133e03ba3b6bd983c275f7cc78634f9155f
--- /dev/null
+++ b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py
@@ -0,0 +1,308 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import collections.abc
+import functools
+import inspect
+import os
+
+import numpy as np
+import torch as torch_original
+
+import oneflow as flow
+
+from .generators import Nothing, generator, random_tensor
+
+postulate = [".rand", ".Tensor"]
+
+
+def torch_tensor_to_flow(x):
+    return flow.tensor(x.cpu().numpy())
+
+
+class PyTorchDoesNotSupportError(Exception):
+    def __init__(self, exc):
+        self.exc = exc
+
+    def __str__(self):
+        return repr(self)
+
+    def __repr__(self):
+        return f"PyTorch error: {str(self.exc)}"
+
+
+def get_args(callable, *args, **kwargs):
+    try:
+        spec = inspect.getfullargspec(callable)
+        spec_args = spec.args
+        if spec_args[0] == "self":
+            del spec_args[0]
+        for (i, arg) in enumerate(args):
+            arg_name = spec_args[i]
+            annotation = spec.annotations[arg_name]
+            if isinstance(arg, generator):
+                arg.to(annotation)
+        for (arg_name, arg) in kwargs.items():
+            annotation = spec.annotations[arg_name]
+            if isinstance(arg, generator):
+                arg.to(annotation)
+    except:
+        pass
+    (pytorch_args, pytorch_kwargs, oneflow_args, oneflow_kwargs) = ([], {}, [], {})
+
+    def get_pytorch_value(x):
+        if isinstance(x, DualObject):
+            return x.pytorch
+        return x
+
+    def get_oneflow_value(x):
+        if isinstance(x, DualObject):
+            return x.oneflow
+        return x
+
+    def get_generator_value(x):
+        if isinstance(x, generator):
+            return x.value()
+        return x
+
+    for arg in args:
+        arg = get_generator_value(arg)
+        pytorch_args.append(get_pytorch_value(arg))
+        oneflow_args.append(get_oneflow_value(arg))
+    for (key, value) in kwargs.items():
+        value = get_generator_value(value)
+        if isinstance(value, Nothing):
+            continue
+        pytorch_kwargs[key] = get_pytorch_value(value)
+        oneflow_kwargs[key] = get_oneflow_value(value)
+    return (pytorch_args, pytorch_kwargs, oneflow_args, oneflow_kwargs)
+
+
+counter = 0
+
+
+def GetDualObject(name, pytorch, oneflow):
+    global counter
+    counter += 1
+    skipped_magic_methods = [
+        "__class__",
+        "__mro__",
+        "__new__",
+        "__init__",
+        "__getattr__",
+        "__setattr__",
+        "__getattribute__",
+        "__dict__",
+        "__weakref__",
+        "__builtins__",
+        "__qualname__",
+        "__name__",
+        "__str__",
+        "__repr__",
+    ]
+    pytorch_methods = dir(pytorch)
+    if hasattr(pytorch, "__call__") and "__call__" not in pytorch_methods:
+        pytorch_methods.append("__call__")
+    magic_methods_for_new_cls = {}
+    for method_name in pytorch_methods:
+        if method_name.startswith("__") and method_name not in skipped_magic_methods:
+
+            def get_dual_method(method_name):
+                if method_name == "__call__":
+
+                    def dual_method(self, *args, **kwargs):
+                        (
+                            pytorch_args,
+                            pytorch_kwargs,
+                            oneflow_args,
+                            oneflow_kwargs,
+                        ) = get_args(pytorch, *args, **kwargs)
+                        try:
+                            pytorch_res = pytorch(*pytorch_args, **pytorch_kwargs)
+                        except Exception as e:
+                            raise PyTorchDoesNotSupportError(e)
+                        if name in postulate:
+                            oneflow_res = torch_tensor_to_flow(pytorch_res)
+                        else:
+                            oneflow_res = oneflow(*oneflow_args, **oneflow_kwargs)
+                        return GetDualObject("unused", pytorch_res, oneflow_res)
+
+                else:
+
+                    def dual_method(self, *args, **kwargs):
+                        pytorch_method = getattr(pytorch, method_name)
+                        oneflow_method = getattr(oneflow, method_name)
+                        (
+                            pytorch_args,
+                            pytorch_kwargs,
+                            oneflow_args,
+                            oneflow_kwargs,
+                        ) = get_args(pytorch_method, *args, **kwargs)
+                        try:
+                            pytorch_res = pytorch_method(
+                                *pytorch_args, **pytorch_kwargs
+                            )
+                        except Exception as e:
+                            raise PyTorchDoesNotSupportError(e)
+                        oneflow_res = oneflow_method(*oneflow_args, **oneflow_kwargs)
+                        return GetDualObject("unused", pytorch_res, oneflow_res)
+
+                return dual_method
+
+            magic_methods_for_new_cls[method_name] = get_dual_method(method_name)
+    Cls = type(f"{name}_{counter}", (DualObject,), magic_methods_for_new_cls)
+    return Cls(name, pytorch, oneflow)
+
+
+class DualObject:
+    def __init__(self, name, pytorch, oneflow):
+        self.name = name
+        self.pytorch = pytorch
+        self.oneflow = oneflow
+        if isinstance(pytorch, torch_original.nn.Module):
+            state_dict = pytorch.state_dict()
+            state_dict = {k: v.detach().cpu().numpy() for (k, v) in state_dict.items()}
+            oneflow.load_state_dict(state_dict)
+            dual_modules_to_test.append(self)
+        if isinstance(pytorch, torch_original.Tensor):
+            dual_objects_to_test.append(self)
+
+    def __repr__(self):
+        return f"PyTorch object:\n{self.pytorch}\n\nOneFlow object:\n{self.oneflow}"
+
+    def __getattr__(self, key):
+        pytorch_attr = getattr(self.pytorch, key)
+        oneflow_attr = getattr(self.oneflow, key)
+        new_name = f"{self.name}.{key}"
+        return GetDualObject(new_name, pytorch_attr, oneflow_attr)
+
+
+dual_modules_to_test = []
+dual_objects_to_test = []
+torch_type2checker = {}
+
+
+def equality_checker(torch_type, flow_type):
+    def deco(f):
+        torch_type2checker[torch_type, flow_type] = f
+        return f
+
+    return deco
+
+
+def check_equality(dual_object: DualObject, rtol=0.0001, atol=1e-05):
+    checker = torch_type2checker.get(
+        (type(dual_object.pytorch), type(dual_object.oneflow)), None
+    )
+    if checker is None:
+        for (key, value) in torch_type2checker.items():
+            if isinstance(dual_object.pytorch, key[0]) and isinstance(
+                dual_object.oneflow, key[1]
+            ):
+                checker = value
+                break
+    assert checker is not None
+    return checker(dual_object.pytorch, dual_object.oneflow, rtol, atol)
+
+
+@equality_checker(torch_original.Tensor, flow.Tensor)
+@equality_checker(torch_original.Tensor, flow._oneflow_internal.Tensor)
+def check_tensor_equality(torch_tensor, flow_tensor, rtol=0.0001, atol=1e-05):
+    if torch_tensor.grad is not None:
+        assert (
+            flow_tensor.grad is not None
+        ), "OneFlow tensor doesn't have grad while PyTorch tensor has one"
+        if not np.allclose(
+            torch_tensor.grad.detach().cpu().numpy(), flow_tensor.grad.numpy()
+        ):
+            return False
+    return np.allclose(
+        torch_tensor.detach().cpu().numpy(),
+        flow_tensor.numpy(),
+        rtol=rtol,
+        atol=atol,
+        equal_nan=True,
+    )
+
+
+def autotest(n=20, auto_backward=True, rtol=0.0001, atol=1e-05):
+    verbose = os.getenv("ONEFLOW_TEST_VERBOSE") is not None
+
+    def deco(f):
+        @functools.wraps(f)
+        def new_f(test_case):
+            nonlocal n
+            while n > 0:
+                dual_modules_to_test.clear()
+                dual_objects_to_test.clear()
+                try:
+                    res = f(test_case)
+                except PyTorchDoesNotSupportError as e:
+                    if verbose:
+                        print(e)
+                    continue
+                if res is not None:
+                    if not isinstance(res, collections.abc.Sequence):
+                        res = [res]
+                    for x in res:
+                        if auto_backward:
+                            if isinstance(x.pytorch, torch_original.Tensor):
+                                x.sum().backward()
+                        dual_objects_to_test.append(x)
+                for x in dual_modules_to_test:
+                    for key in x.pytorch.state_dict().keys():
+                        dual_objects_to_test.append(
+                            GetDualObject(
+                                "unused",
+                                x.pytorch.state_dict()[key],
+                                x.oneflow.state_dict()[key],
+                            )
+                        )
+                for x in dual_objects_to_test:
+                    test_case.assertTrue(check_equality(x, rtol=rtol, atol=atol))
+                if verbose:
+                    print("test passed")
+                n -= 1
+
+        return new_f
+
+    return deco
+
+
+def random_pytorch_tensor(
+    ndim=None,
+    dim0=1,
+    dim1=None,
+    dim2=None,
+    dim3=None,
+    dim4=None,
+    low=0,
+    high=1,
+    dtype=float,
+    requires_grad=True,
+):
+    if isinstance(requires_grad, generator):
+        requires_grad = requires_grad.value()
+    pytorch_tensor = (
+        random_tensor(ndim, dim0, dim1, dim2, dim3, dim4, low, high, dtype)
+        .value()
+        .requires_grad_(requires_grad and dtype != int)
+    )
+    flow_tensor = flow.tensor(pytorch_tensor.detach().cpu().numpy(), requires_grad=True)
+    return GetDualObject("unused", pytorch_tensor, flow_tensor)
+
+
+torch = GetDualObject("", torch_original, flow)
+__all__ = ["torch", "autotest", "random_pytorch_tensor"]
diff --git a/python/oneflow/tmp.py b/python/oneflow/tmp.py
new file mode 100644
index 0000000000000000000000000000000000000000..3089717e2aeea835b87c568f2fdc18a8a2a21398
--- /dev/null
+++ b/python/oneflow/tmp.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.nn.modules.dataset import get_ofrecord_handle as OfrecordReader
+from oneflow.nn.modules.dataset import raw_decoder as RawDecoder
+from oneflow.nn.modules.slice import logical_slice_assign_op as logical_slice_assign
diff --git a/python/oneflow/train.py b/python/oneflow/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffc7d94d7678b1316aadbdec0937b85bdc5631a2
--- /dev/null
+++ b/python/oneflow/train.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.check_point import CheckPoint, SimpleCheckPointManager
diff --git a/python/oneflow/typing.py b/python/oneflow/typing.py
new file mode 100644
index 0000000000000000000000000000000000000000..e872b56fc2e04118628f88056f48e6d2d634977a
--- /dev/null
+++ b/python/oneflow/typing.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.typing import Bundle, Callback, ListNumpy, Numpy
diff --git a/python/oneflow/unittest/__init__.py b/python/oneflow/unittest/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..827f60443dca9827d509ea04bd286e239bd22910
--- /dev/null
+++ b/python/oneflow/unittest/__init__.py
@@ -0,0 +1,28 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.unittest import (
+    TestCase,
+    num_nodes_required,
+    register_test_cases,
+    skip_unless_1n1d,
+    skip_unless_1n2d,
+    skip_unless_1n4d,
+    skip_unless_2n1d,
+    skip_unless_2n2d,
+    skip_unless_2n4d,
+)
+
+from . import env
diff --git a/python/oneflow/unittest/env.py b/python/oneflow/unittest/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..33d588bb2276179f35087b8e6bad67f0bbdf4d5b
--- /dev/null
+++ b/python/oneflow/unittest/env.py
@@ -0,0 +1,25 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.unittest import (
+    device_num,
+    eager_execution_enabled,
+    has_node_list,
+    has_world_size,
+    node_list,
+    node_size,
+    typing_check_enabled,
+    world_size,
+)
diff --git a/python/oneflow/util.py b/python/oneflow/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..552854f24c07c34cd6fa5e2c1f742e1ec5283b70
--- /dev/null
+++ b/python/oneflow/util.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.framework.id_util import UniqueStr as unique_str
diff --git a/python/oneflow/utils/__init__.py b/python/oneflow/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/oneflow/utils/data/__init__.py b/python/oneflow/utils/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fb3431f5ec22b97a0056a30183c206a6fa09610
--- /dev/null
+++ b/python/oneflow/utils/data/__init__.py
@@ -0,0 +1,57 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from oneflow.utils.data.sampler import (
+    Sampler,
+    SequentialSampler,
+    RandomSampler,
+    SubsetRandomSampler,
+    BatchSampler,
+)
+from oneflow.utils.data.dataset import (
+    Dataset,
+    IterableDataset,
+    TensorDataset,
+    ConcatDataset,
+    Subset,
+    random_split,
+)
+from oneflow.utils.data.dataset import IterableDataset as IterDataPipe
+from oneflow.utils.data.dataloader import DataLoader, _DatasetKind
+from oneflow.utils.data.decorator import (
+    functional_datapipe,
+    guaranteed_datapipes_determinism,
+    non_deterministic,
+)
+
+__all__ = [
+    "Sampler",
+    "SequentialSampler",
+    "RandomSampler",
+    "SubsetRandomSampler",
+    "BatchSampler",
+    "Dataset",
+    "IterableDataset",
+    "TensorDataset",
+    "ConcatDataset",
+    "Subset",
+    "random_split",
+    "DataLoader",
+    "_DatasetKind",
+    "IterDataPipe",
+    "functional_datapipe",
+    "guaranteed_datapipes_determinism",
+    "non_deterministic",
+]
diff --git a/python/oneflow/utils/data/_utils/__init__.py b/python/oneflow/utils/data/_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a857ec7d7652212dfd198112e4965ded80d8e37
--- /dev/null
+++ b/python/oneflow/utils/data/_utils/__init__.py
@@ -0,0 +1,40 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+"""Utility classes & functions for data loading. Code in this folder is mostly
+used by ../dataloder.py.
+
+A lot of multiprocessing is used in data loading, which only supports running
+functions defined in global environment (py2 can't serialize static methods).
+Therefore, for code tidiness we put these functions into different files in this
+folder.
+"""
+import atexit
+import sys
+
+IS_WINDOWS = sys.platform == "win32"
+MP_STATUS_CHECK_INTERVAL = 5.0
+"Interval (in seconds) to check status of processes to avoid hanging in\n    multiprocessing data loading. This is mainly used in getting data from\n    another process, in which case we need to periodically check whether the\n    sender is alive to prevent hanging."
+python_exit_status = False
+"Whether Python is shutting down. This flag is guaranteed to be set before\nthe Python core library resources are freed, but Python may already be exiting\nfor some time when this is set.\n\nHook to set this flag is `_set_python_exit_flag`, and is inspired by a similar\nhook in Python 3.7 multiprocessing library:\nhttps://github.com/python/cpython/blob/d4d60134b29290049e28df54f23493de4f1824b6/Lib/multiprocessing/util.py#L277-L327\n"
+
+
+def _set_python_exit_flag():
+    global python_exit_status
+    python_exit_status = True
+
+
+atexit.register(_set_python_exit_flag)
+from . import collate, fetch
diff --git a/python/oneflow/utils/data/_utils/collate.py b/python/oneflow/utils/data/_utils/collate.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2e43da6230aaee31fa15a6778721db005d672b2
--- /dev/null
+++ b/python/oneflow/utils/data/_utils/collate.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+""""Contains definitions of the methods used by the _BaseDataLoaderIter workers to
+collate samples fetched from dataset into Tensor(s).
+
+These **needs** to be in global scope since Py2 doesn't support serializing
+static methods.
+"""
+import collections
+import re
+
+import oneflow as flow
+import oneflow.utils as utils
+
+string_classes = (str, bytes)
+np_str_obj_array_pattern = re.compile("[SaUO]")
+
+
+def default_convert(data):
+    """Converts each NumPy array data field into a tensor"""
+    elem_type = type(data)
+    if isinstance(data, (flow.Tensor, flow._oneflow_internal.Tensor)):
+        return data
+    elif (
+        elem_type.__module__ == "numpy"
+        and elem_type.__name__ != "str_"
+        and (elem_type.__name__ != "string_")
+    ):
+        if (
+            elem_type.__name__ == "ndarray"
+            and np_str_obj_array_pattern.search(data.dtype.str) is not None
+        ):
+            return data
+        return flow.tensor(data)
+    elif isinstance(data, collections.abc.Mapping):
+        return {key: default_convert(data[key]) for key in data}
+    elif isinstance(data, tuple) and hasattr(data, "_fields"):
+        return elem_type(*(default_convert(d) for d in data))
+    elif isinstance(data, collections.abc.Sequence) and (
+        not isinstance(data, string_classes)
+    ):
+        return [default_convert(d) for d in data]
+    else:
+        raise TypeError(default_convert_err_msg_format.format(elem_type))
+
+
+default_collate_err_msg_format = "default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found {}"
+default_convert_err_msg_format = "default_convert: batch must contain tensors, numpy arrays, numbers, dicts or lists; found {}"
+
+
+def default_collate(batch):
+    """Puts each data field into a tensor with outer dimension batch size"""
+    elem = batch[0]
+    elem_type = type(elem)
+    if isinstance(elem, (flow.Tensor, flow._oneflow_internal.Tensor)):
+        return flow.stack(batch, dim=0)
+    elif (
+        elem_type.__module__ == "numpy"
+        and elem_type.__name__ != "str_"
+        and (elem_type.__name__ != "string_")
+    ):
+        if elem_type.__name__ == "ndarray" or elem_type.__name__ == "memmap":
+            if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+            return default_collate([flow.Tensor(b) for b in batch])
+        elif elem.shape == ():
+            return flow.Tensor(batch)
+    elif isinstance(elem, float):
+        return flow.tensor(batch, dtype=flow.float64)
+    elif isinstance(elem, int):
+        return flow.tensor(batch)
+    elif isinstance(elem, string_classes):
+        return batch
+    elif isinstance(elem, collections.abc.Mapping):
+        return {key: default_collate([d[key] for d in batch]) for key in elem}
+    elif isinstance(elem, tuple) and hasattr(elem, "_fields"):
+        return elem_type(*(default_collate(samples) for samples in zip(*batch)))
+    elif isinstance(elem, collections.abc.Sequence):
+        it = iter(batch)
+        elem_size = len(next(it))
+        if not all((len(elem) == elem_size for elem in it)):
+            raise RuntimeError("each element in list of batch should be of equal size")
+        transposed = zip(*batch)
+        return [default_collate(samples) for samples in transposed]
+    raise TypeError(default_collate_err_msg_format.format(elem_type))
diff --git a/python/oneflow/utils/data/_utils/fetch.py b/python/oneflow/utils/data/_utils/fetch.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bd2b3464a2c67fd0729e22f212d30f3d3584a6c
--- /dev/null
+++ b/python/oneflow/utils/data/_utils/fetch.py
@@ -0,0 +1,68 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+""""Contains definitions of the methods used by the _BaseDataLoaderIter to fetch
+data from an iterable-style or map-style dataset. This logic is shared in both
+single- and multi-processing data loading.
+"""
+
+
+class _BaseDatasetFetcher(object):
+    def __init__(self, dataset, auto_collation, collate_fn, drop_last):
+        self.dataset = dataset
+        self.auto_collation = auto_collation
+        self.collate_fn = collate_fn
+        self.drop_last = drop_last
+
+    def fetch(self, possibly_batched_index):
+        raise NotImplementedError()
+
+
+class _IterableDatasetFetcher(_BaseDatasetFetcher):
+    def __init__(self, dataset, auto_collation, collate_fn, drop_last):
+        super(_IterableDatasetFetcher, self).__init__(
+            dataset, auto_collation, collate_fn, drop_last
+        )
+        self.dataset_iter = iter(dataset)
+
+    def fetch(self, possibly_batched_index):
+        if self.auto_collation:
+            data = []
+            for _ in possibly_batched_index:
+                try:
+                    data.append(next(self.dataset_iter))
+                except StopIteration:
+                    break
+            if len(data) == 0 or (
+                self.drop_last and len(data) < len(possibly_batched_index)
+            ):
+                raise StopIteration
+        else:
+            data = next(self.dataset_iter)
+        return self.collate_fn(data)
+
+
+class _MapDatasetFetcher(_BaseDatasetFetcher):
+    def __init__(self, dataset, auto_collation, collate_fn, drop_last):
+        super(_MapDatasetFetcher, self).__init__(
+            dataset, auto_collation, collate_fn, drop_last
+        )
+
+    def fetch(self, possibly_batched_index):
+        if self.auto_collation:
+            data = [self.dataset[idx] for idx in possibly_batched_index]
+        else:
+            data = self.dataset[possibly_batched_index]
+        return self.collate_fn(data)
diff --git a/python/oneflow/utils/data/dataloader.py b/python/oneflow/utils/data/dataloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6c246974827d00df4f7a5ff4b237360621862c4
--- /dev/null
+++ b/python/oneflow/utils/data/dataloader.py
@@ -0,0 +1,414 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import sys
+import traceback
+import warnings
+from typing import Any, Callable, Generic, List, Optional, Sequence, TypeVar
+
+import oneflow as flow
+
+
+class ExceptionWrapper(object):
+    """Wraps an exception plus traceback to communicate across threads"""
+
+    def __init__(self, exc_info=None, where="in background"):
+        if exc_info is None:
+            exc_info = sys.exc_info()
+        self.exc_type = exc_info[0]
+        self.exc_msg = "".join(traceback.format_exception(*exc_info))
+        self.where = where
+
+    def reraise(self):
+        """Reraises the wrapped exception in the current thread"""
+        msg = "Caught {} {}.\nOriginal {}".format(
+            self.exc_type.__name__, self.where, self.exc_msg
+        )
+        if self.exc_type == KeyError:
+            msg = KeyErrorMessage(msg)
+        elif getattr(self.exc_type, "message", None):
+            raise self.exc_type(message=msg)
+        raise self.exc_type(msg)
+
+
+string_classes = (str, bytes)
+from . import (
+    BatchSampler,
+    Dataset,
+    IterableDataset,
+    RandomSampler,
+    Sampler,
+    SequentialSampler,
+    _utils,
+)
+
+T_co = TypeVar("T_co", covariant=True)
+T = TypeVar("T")
+_worker_init_fn_t = Callable[[int], None]
+_collate_fn_t = Callable[[List[T]], Any]
+default_collate: _collate_fn_t = _utils.collate.default_collate
+
+
+class _DatasetKind(object):
+    Map = 0
+    Iterable = 1
+
+    @staticmethod
+    def create_fetcher(kind, dataset, auto_collation, collate_fn, drop_last):
+        if kind == _DatasetKind.Map:
+            return _utils.fetch._MapDatasetFetcher(
+                dataset, auto_collation, collate_fn, drop_last
+            )
+        else:
+            return _utils.fetch._IterableDatasetFetcher(
+                dataset, auto_collation, collate_fn, drop_last
+            )
+
+
+class _InfiniteConstantSampler(Sampler):
+    """Analogous to ``itertools.repeat(None, None)``.
+    Used as sampler for :class:`~flow.utils.data.IterableDataset`.
+
+    Args:
+        data_source (Dataset): dataset to sample from
+    """
+
+    def __init__(self):
+        super(_InfiniteConstantSampler, self).__init__(None)
+
+    def __iter__(self):
+        while True:
+            yield None
+
+
+class DataLoader(Generic[T_co]):
+    """
+    Data loader. Combines a dataset and a sampler, and provides an iterable over
+    the given dataset.
+
+    The :class:`~flow.utils.data.DataLoader` supports both map-style and
+    iterable-style datasets with single- or multi-process loading, customizing
+    loading order and optional automatic batching (collation) and memory pinning.
+
+    See :py:mod:`flow.utils.data` documentation page for more details.
+
+    Args:
+        dataset (Dataset): dataset from which to load the data.
+        batch_size (int, optional): how many samples per batch to load
+            (default: ``1``).
+        shuffle (bool, optional): set to ``True`` to have the data reshuffled
+            at every epoch (default: ``False``).
+        sampler (Sampler or Iterable, optional): defines the strategy to draw
+            samples from the dataset. Can be any ``Iterable`` with ``__len__``
+            implemented. If specified, :attr:`shuffle` must not be specified.
+        batch_sampler (Sampler or Iterable, optional): like :attr:`sampler`, but
+            returns a batch of indices at a time. Mutually exclusive with
+            :attr:`batch_size`, :attr:`shuffle`, :attr:`sampler`,
+            and :attr:`drop_last`.
+        num_workers (int, optional): how many subprocesses to use for data
+            loading. ``0`` means that the data will be loaded in the main process.
+            (default: ``0``)
+        collate_fn (callable, optional): merges a list of samples to form a
+            mini-batch of Tensor(s).  Used when using batched loading from a
+            map-style dataset.
+        drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
+            if the dataset size is not divisible by the batch size. If ``False`` and
+            the size of dataset is not divisible by the batch size, then the last batch
+            will be smaller. (default: ``False``)
+        timeout (numeric, optional): if positive, the timeout value for collecting a batch
+            from workers. Should always be non-negative. (default: ``0``)
+        worker_init_fn (callable, optional): If not ``None``, this will be called on each
+            worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
+            input, after seeding and before data loading. (default: ``None``)
+        prefetch_factor (int, optional, keyword-only arg): Number of samples loaded
+            in advance by each worker. ``2`` means there will be a total of
+            2 * num_workers samples prefetched across all workers. (default: ``2``)
+        persistent_workers (bool, optional): If ``True``, the data loader will not shutdown
+            the worker processes after a dataset has been consumed once. This allows to
+            maintain the workers `Dataset` instances alive. (default: ``False``)
+
+
+    .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn`
+                 cannot be an unpicklable object, e.g., a lambda function. See
+                 :ref:`multiprocessing-best-practices` on more details related
+                 to multiprocessing in OneFlow.
+
+    .. warning:: ``len(dataloader)`` heuristic is based on the length of the sampler used.
+                 When :attr:`dataset` is an :class:`~flow.utils.data.IterableDataset`,
+                 it instead returns an estimate based on ``len(dataset) / batch_size``, with proper
+                 rounding depending on :attr:`drop_last`, regardless of multi-process loading
+                 configurations. This represents the best guess OneFlow can make because OneFlow
+                 trusts user :attr:`dataset` code in correctly handling multi-process
+                 loading to avoid duplicate data.
+
+                 However, if sharding results in multiple workers having incomplete last batches,
+                 this estimate can still be inaccurate, because (1) an otherwise complete batch can
+                 be broken into multiple ones and (2) more than one batch worth of samples can be
+                 dropped when :attr:`drop_last` is set. Unfortunately, OneFlow can not detect such
+                 cases in general.
+
+                 See `Dataset Types`_ for more details on these two types of datasets and how
+                 :class:`~flow.utils.data.IterableDataset` interacts with
+                 `Multi-process data loading`_.
+
+    .. warning:: See :ref:`reproducibility`, and :ref:`dataloader-workers-random-seed`, and
+                 :ref:`data-loading-randomness` notes for random seed related questions.
+    """
+
+    dataset: Dataset[T_co]
+    batch_size: Optional[int]
+    num_workers: int
+    drop_last: bool
+    timeout: float
+    sampler: Sampler
+    prefetch_factor: int
+    _iterator: Optional["_BaseDataLoaderIter"]
+    __initialized = False
+
+    def __init__(
+        self,
+        dataset: Dataset[T_co],
+        batch_size: Optional[int] = 1,
+        shuffle: bool = False,
+        sampler: Optional[Sampler[int]] = None,
+        batch_sampler: Optional[Sampler[Sequence[int]]] = None,
+        num_workers: int = 0,
+        collate_fn: Optional[_collate_fn_t] = None,
+        drop_last: bool = False,
+        timeout: float = 0,
+        worker_init_fn: Optional[_worker_init_fn_t] = None,
+        generator=None,
+        *,
+        prefetch_factor: int = 2,
+        persistent_workers: bool = False
+    ):
+        if num_workers < 0:
+            raise ValueError(
+                "num_workers option should be non-negative; use num_workers=0 to disable multiprocessing."
+            )
+        if num_workers >= 1:
+            warnings.warn(
+                "Not support multiprocessing dataloader yet, we will temporary set num_workers=0!"
+            )
+            num_workers = 0
+        if timeout < 0:
+            raise ValueError("timeout option should be non-negative")
+        if num_workers == 0 and prefetch_factor != 2:
+            raise ValueError(
+                "prefetch_factor option could only be specified in multiprocessing.let num_workers > 0 to enable multiprocessing."
+            )
+        assert prefetch_factor > 0
+        if persistent_workers and num_workers == 0:
+            raise ValueError("persistent_workers option needs num_workers > 0")
+        self.dataset = dataset
+        self.num_workers = num_workers
+        self.prefetch_factor = prefetch_factor
+        self.timeout = timeout
+        self.worker_init_fn = worker_init_fn
+        if isinstance(dataset, IterableDataset):
+            self._dataset_kind = _DatasetKind.Iterable
+            if shuffle is not False:
+                raise ValueError(
+                    "DataLoader with IterableDataset: expected unspecified shuffle option, but got shuffle={}".format(
+                        shuffle
+                    )
+                )
+            elif sampler is not None:
+                raise ValueError(
+                    "DataLoader with IterableDataset: expected unspecified sampler option, but got sampler={}".format(
+                        sampler
+                    )
+                )
+            elif batch_sampler is not None:
+                raise ValueError(
+                    "DataLoader with IterableDataset: expected unspecified batch_sampler option, but got batch_sampler={}".format(
+                        batch_sampler
+                    )
+                )
+        else:
+            self._dataset_kind = _DatasetKind.Map
+        if sampler is not None and shuffle:
+            raise ValueError("sampler option is mutually exclusive with shuffle")
+        if batch_sampler is not None:
+            if batch_size != 1 or shuffle or sampler is not None or drop_last:
+                raise ValueError(
+                    "batch_sampler option is mutually exclusive with batch_size, shuffle, sampler, and drop_last"
+                )
+            batch_size = None
+            drop_last = False
+        elif batch_size is None:
+            if drop_last:
+                raise ValueError(
+                    "batch_size=None option disables auto-batching and is mutually exclusive with drop_last"
+                )
+        if sampler is None:
+            if self._dataset_kind == _DatasetKind.Iterable:
+                sampler = _InfiniteConstantSampler()
+            elif shuffle:
+                sampler = RandomSampler(dataset, generator=generator)
+            else:
+                sampler = SequentialSampler(dataset)
+        if batch_size is not None and batch_sampler is None:
+            batch_sampler = BatchSampler(sampler, batch_size, drop_last)
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+        self.sampler = sampler
+        self.batch_sampler = batch_sampler
+        self.generator = generator
+        if collate_fn is None:
+            if self._auto_collation:
+                collate_fn = _utils.collate.default_collate
+            else:
+                collate_fn = _utils.collate.default_convert
+        self.collate_fn = collate_fn
+        self.persistent_workers = persistent_workers
+        self.__initialized = True
+        self._IterableDataset_len_called = None
+        self._iterator = None
+
+    def _get_iterator(self) -> "_BaseDataLoaderIter":
+        if self.num_workers == 0 or self.num_workers == 1:
+            return _SingleProcessDataLoaderIter(self)
+        else:
+            raise NotImplementedError("Multiprocessing dataloader is not support yet!")
+
+    def __setattr__(self, attr, val):
+        if self.__initialized and attr in (
+            "batch_size",
+            "batch_sampler",
+            "sampler",
+            "drop_last",
+            "dataset",
+            "persistent_workers",
+        ):
+            raise ValueError(
+                "{} attribute should not be set after {} is initialized".format(
+                    attr, self.__class__.__name__
+                )
+            )
+        super(DataLoader, self).__setattr__(attr, val)
+
+    def __iter__(self) -> "_BaseDataLoaderIter":
+        if self.persistent_workers and self.num_workers > 0:
+            if self._iterator is None:
+                self._iterator = self._get_iterator()
+            else:
+                self._iterator._reset(self)
+            return self._iterator
+        else:
+            return self._get_iterator()
+
+    @property
+    def _auto_collation(self):
+        return self.batch_sampler is not None
+
+    @property
+    def _index_sampler(self):
+        if self._auto_collation:
+            return self.batch_sampler
+        else:
+            return self.sampler
+
+    def __len__(self) -> int:
+        if self._dataset_kind == _DatasetKind.Iterable:
+            length = self._IterableDataset_len_called = len(self.dataset)
+            if self.batch_size is not None:
+                from math import ceil
+
+                if self.drop_last:
+                    length = length // self.batch_size
+                else:
+                    length = ceil(length / self.batch_size)
+            return length
+        else:
+            return len(self._index_sampler)
+
+
+class _BaseDataLoaderIter(object):
+    def __init__(self, loader: DataLoader) -> None:
+        self._dataset = loader.dataset
+        self._dataset_kind = loader._dataset_kind
+        self._IterableDataset_len_called = loader._IterableDataset_len_called
+        self._auto_collation = loader._auto_collation
+        self._drop_last = loader.drop_last
+        self._index_sampler = loader._index_sampler
+        self._num_workers = loader.num_workers
+        self._prefetch_factor = loader.prefetch_factor
+        self._timeout = loader.timeout
+        self._collate_fn = loader.collate_fn
+        self._sampler_iter = iter(self._index_sampler)
+        self._base_seed = flow.Tensor([0], dtype=flow.int64).uniform_().numpy().item()
+        self._persistent_workers = loader.persistent_workers
+        self._num_yielded = 0
+        self._profile_name = "enumerate(DataLoader)#{}.__next__".format(
+            self.__class__.__name__
+        )
+
+    def __iter__(self) -> "_BaseDataLoaderIter":
+        return self
+
+    def _reset(self, loader, first_iter=False):
+        self._sampler_iter = iter(self._index_sampler)
+        self._num_yielded = 0
+        self._IterableDataset_len_called = loader._IterableDataset_len_called
+
+    def _next_index(self):
+        return next(self._sampler_iter)
+
+    def _next_data(self):
+        raise NotImplementedError
+
+    def __next__(self) -> Any:
+        if self._sampler_iter is None:
+            self._reset()
+        data = self._next_data()
+        self._num_yielded += 1
+        if (
+            self._dataset_kind == _DatasetKind.Iterable
+            and self._IterableDataset_len_called is not None
+            and (self._num_yielded > self._IterableDataset_len_called)
+        ):
+            warn_msg = "Length of IterableDataset {} was reported to be {} (when accessing len(dataloader)), but {} samples have been fetched. ".format(
+                self._dataset, self._IterableDataset_len_called, self._num_yielded
+            )
+            if self._num_workers > 1:
+                warn_msg += "Multiprocessing dataloader is not support yet!"
+            warnings.warn(warn_msg)
+        return data
+
+    def __len__(self) -> int:
+        return len(self._index_sampler)
+
+    def __getstate__(self):
+        raise NotImplementedError("{} cannot be pickled", self.__class__.__name__)
+
+
+class _SingleProcessDataLoaderIter(_BaseDataLoaderIter):
+    def __init__(self, loader):
+        super(_SingleProcessDataLoaderIter, self).__init__(loader)
+        assert self._timeout == 0
+        assert 0 <= self._num_workers <= 1
+        self._dataset_fetcher = _DatasetKind.create_fetcher(
+            self._dataset_kind,
+            self._dataset,
+            self._auto_collation,
+            self._collate_fn,
+            self._drop_last,
+        )
+
+    def _next_data(self):
+        index = self._next_index()
+        return self._dataset_fetcher.fetch(index)
diff --git a/python/oneflow/utils/data/dataset.py b/python/oneflow/utils/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..77bd5db77200e6de2df965a24735c77360d01bdf
--- /dev/null
+++ b/python/oneflow/utils/data/dataset.py
@@ -0,0 +1,306 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import bisect
+import functools
+import warnings
+from typing import (
+    Callable,
+    Dict,
+    Generic,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    TypeVar,
+)
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+
+default_generator = flow.Generator()
+
+
+def _accumulate(iterable, fn=lambda x, y: x + y):
+    """Return running totals"""
+    it = iter(iterable)
+    try:
+        total = next(it)
+    except StopIteration:
+        return
+    yield total
+    for element in it:
+        total = fn(total, element)
+        yield total
+
+
+T_co = TypeVar("T_co", covariant=True)
+T = TypeVar("T")
+
+
+class Dataset(Generic[T_co]):
+    """An abstract class representing a :class:`Dataset`.
+
+    All datasets that represent a map from keys to data samples should subclass
+    it. All subclasses should overwrite :meth:`__getitem__`, supporting fetching a
+    data sample for a given key. Subclasses could also optionally overwrite
+    :meth:`__len__`, which is expected to return the size of the dataset by many
+    :class:`~flow.utils.data.Sampler` implementations and the default options
+    of :class:`~flow.utils.data.DataLoader`.
+
+    .. note::
+      :class:`~flow.utils.data.DataLoader` by default constructs a index
+      sampler that yields integral indices.  To make it work with a map-style
+      dataset with non-integral indices/keys, a custom sampler must be provided.
+    """
+
+    def __getitem__(self, index) -> T_co:
+        raise NotImplementedError
+
+    def __add__(self, other: "Dataset[T_co]") -> "ConcatDataset[T_co]":
+        return ConcatDataset([self, other])
+
+
+class IterableDataset(Dataset[T_co]):
+    """An iterable Dataset.
+
+    All datasets that represent an iterable of data samples should subclass it.
+    Such form of datasets is particularly useful when data come from a stream.
+
+    All subclasses should overwrite :meth:`__iter__`, which would return an
+    iterator of samples in this dataset.
+
+    When a subclass is used with :class:`~flow.utils.data.DataLoader`, each
+    item in the dataset will be yielded from the :class:`~flow.utils.data.DataLoader`
+    iterator. When :attr:`num_workers > 0`, each worker process will have a
+    different copy of the dataset object, so it is often desired to configure
+    each copy independently to avoid having duplicate data returned from the
+    workers.
+
+    Example 1: splitting workload across all workers in :meth:`__iter__`::
+
+        >>> class MyIterableDataset(flow.utils.data.IterableDataset):
+        ...     def __init__(self, start, end):
+        ...         super(MyIterableDataset).__init__()
+        ...         assert end > start, "this example code only works with end >= start"
+        ...         self.start = start
+        ...         self.end = end
+        ...
+        ...     def __iter__(self):
+        ...         iter_start = self.start
+        ...         iter_end = self.end
+        ...         return iter(range(iter_start, iter_end))
+        ...
+        >>> # should give same set of data as range(3, 7), i.e., [3, 4, 5, 6].
+        >>> ds = MyIterableDataset(start=3, end=7)
+
+        >>> # Single-process loading
+        >>> print(list(flow.utils.data.DataLoader(ds, num_workers=0)))
+        [3, 4, 5, 6]
+
+
+    Example 2: splitting workload across all workers using :attr:`worker_init_fn`::
+
+        >>> class MyIterableDataset(flow.utils.data.IterableDataset):
+        ...     def __init__(self, start, end):
+        ...         super(MyIterableDataset).__init__()
+        ...         assert end > start, "this example code only works with end >= start"
+        ...         self.start = start
+        ...         self.end = end
+        ...
+        ...     def __iter__(self):
+        ...         return iter(range(self.start, self.end))
+        ...
+        >>> # should give same set of data as range(3, 7), i.e., [3, 4, 5, 6].
+        >>> ds = MyIterableDataset(start=3, end=7)
+
+        >>> # Single-process loading
+        >>> print(list(flow.utils.data.DataLoader(ds, num_workers=0)))
+        [3, 4, 5, 6]
+
+    """
+
+    functions: Dict[str, Callable] = {}
+    reduce_ex_hook: Optional[Callable] = None
+
+    def __iter__(self) -> Iterator[T_co]:
+        raise NotImplementedError
+
+    def __getattr__(self, attribute_name):
+        if attribute_name in IterableDataset.functions:
+            function = functools.partial(
+                IterableDataset.functions[attribute_name], self
+            )
+            return function
+        else:
+            raise AttributeError
+
+    @classmethod
+    def register_function(cls, function_name, function):
+        IterableDataset.functions[function_name] = function
+
+    @classmethod
+    def register_datapipe_as_function(cls, function_name, cls_to_register):
+        if function_name in IterableDataset.functions:
+            raise Exception(
+                "Unable to add DataPipe function name {} as it is already taken".format(
+                    function_name
+                )
+            )
+
+        def class_function(cls, source_dp, *args, **kwargs):
+            return cls(source_dp, *args, **kwargs)
+
+        function = functools.partial(class_function, cls_to_register)
+        IterableDataset.functions[function_name] = function
+
+    def __reduce_ex__(self, *args, **kwargs):
+        if IterableDataset.reduce_ex_hook is not None:
+            try:
+                return IterableDataset.reduce_ex_hook(self)
+            except NotImplementedError:
+                pass
+        return super().__reduce_ex__(*args, **kwargs)
+
+    @classmethod
+    def set_reduce_ex_hook(cls, hook_fn):
+        if IterableDataset.reduce_ex_hook is not None and hook_fn is not None:
+            raise Exception("Attempt to override existing reduce_ex_hook")
+        IterableDataset.reduce_ex_hook = hook_fn
+
+
+class TensorDataset(Dataset[Tuple[Tensor, ...]]):
+    """Dataset wrapping tensors.
+
+    Each sample will be retrieved by indexing tensors along the first dimension.
+
+    Args:
+        *tensors (Tensor): tensors that have the same size of the first dimension.
+    """
+
+    tensors: Tuple[Tensor, ...]
+
+    def __init__(self, *tensors: Tensor) -> None:
+        assert all(
+            (tensors[0].size(0) == tensor.size(0) for tensor in tensors)
+        ), "Size mismatch between tensors"
+        self.tensors = tensors
+
+    def __getitem__(self, index):
+        return tuple((tensor[index] for tensor in self.tensors))
+
+    def __len__(self):
+        return self.tensors[0].size(0)
+
+
+class ConcatDataset(Dataset[T_co]):
+    """Dataset as a concatenation of multiple datasets.
+
+    This class is useful to assemble different existing datasets.
+
+    Args:
+        datasets (sequence): List of datasets to be concatenated
+    """
+
+    datasets: List[Dataset[T_co]]
+    cumulative_sizes: List[int]
+
+    @staticmethod
+    def cumsum(sequence):
+        (r, s) = ([], 0)
+        for e in sequence:
+            l = len(e)
+            r.append(l + s)
+            s += l
+        return r
+
+    def __init__(self, datasets: Iterable[Dataset]) -> None:
+        super(ConcatDataset, self).__init__()
+        assert len(datasets) > 0, "datasets should not be an empty iterable"
+        self.datasets = list(datasets)
+        for d in self.datasets:
+            assert not isinstance(
+                d, IterableDataset
+            ), "ConcatDataset does not support IterableDataset"
+        self.cumulative_sizes = self.cumsum(self.datasets)
+
+    def __len__(self):
+        return self.cumulative_sizes[-1]
+
+    def __getitem__(self, idx):
+        if idx < 0:
+            if -idx > len(self):
+                raise ValueError(
+                    "absolute value of index should not exceed dataset length"
+                )
+            idx = len(self) + idx
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+        return self.datasets[dataset_idx][sample_idx]
+
+
+class Subset(Dataset[T_co]):
+    """
+    Subset of a dataset at specified indices.
+
+    Args:
+        dataset (Dataset): The whole Dataset
+        indices (sequence): Indices in the whole set selected for subset
+    """
+
+    dataset: Dataset[T_co]
+    indices: Sequence[int]
+
+    def __init__(self, dataset: Dataset[T_co], indices: Sequence[int]) -> None:
+        self.dataset = dataset
+        self.indices = indices
+
+    def __getitem__(self, idx):
+        return self.dataset[self.indices[idx]]
+
+    def __len__(self):
+        return len(self.indices)
+
+
+def random_split(
+    dataset: Dataset[T],
+    lengths: Sequence[int],
+    generator: Optional[flow.Generator] = default_generator,
+) -> List[Subset[T]]:
+    """
+    Randomly split a dataset into non-overlapping new datasets of given lengths.
+    Optionally fix the generator for reproducible results, e.g.:
+
+    >>> random_split(range(10), [3, 7], generator=flow.Generator().manual_seed(42))
+
+    Args:
+        dataset (Dataset): Dataset to be split
+        lengths (sequence): lengths of splits to be produced
+        generator (Generator): Generator used for the random permutation.
+    """
+    if sum(lengths) != len(dataset):
+        raise ValueError(
+            "Sum of input lengths does not equal the length of the input dataset!"
+        )
+    indices = flow.randperm(sum(lengths), generator=generator).tolist()
+    return [
+        Subset(dataset, indices[offset - length : offset])
+        for (offset, length) in zip(_accumulate(lengths), lengths)
+    ]
diff --git a/python/oneflow/utils/data/decorator.py b/python/oneflow/utils/data/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2e69278cc9ac1f0016ffe3f7a1bfe62242ec1d3
--- /dev/null
+++ b/python/oneflow/utils/data/decorator.py
@@ -0,0 +1,112 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from typing import Any, Callable, Optional, Type, Union
+
+from oneflow.utils.data import IterDataPipe
+
+
+class functional_datapipe(object):
+    name: str
+
+    def __init__(self, name: str) -> None:
+        self.name = name
+
+    def __call__(self, cls):
+        if isinstance(cls, Type):
+            if not issubclass(cls, IterDataPipe):
+                raise TypeError("`functional_datapipe` can only decorate IterDataPipe")
+        elif not isinstance(cls, non_deterministic) and (
+            not (
+                hasattr(cls, "__self__") and isinstance(cls.__self__, non_deterministic)
+            )
+        ):
+            raise TypeError("`functional_datapipe` can only decorate IterDataPipe")
+        IterDataPipe.register_datapipe_as_function(self.name, cls)
+        return cls
+
+
+_determinism: bool = False
+
+
+class guaranteed_datapipes_determinism(object):
+    prev: bool
+
+    def __init__(self) -> None:
+        global _determinism
+        self.prev = _determinism
+        _determinism = True
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        global _determinism
+        _determinism = self.prev
+
+
+class non_deterministic(object):
+    cls: Optional[Type[IterDataPipe]] = None
+    deterministic_fn: Callable[[], bool]
+
+    def __init__(self, arg: Union[Type[IterDataPipe], Callable[[], bool]]) -> None:
+        if isinstance(arg, Type):
+            if not issubclass(arg, IterDataPipe):
+                raise TypeError(
+                    "Only `IterDataPipe` can be decorated with `non_deterministic`, but {} is found".format(
+                        arg.__name__
+                    )
+                )
+            self.cls = arg
+        elif isinstance(arg, Callable):
+            self.deterministic_fn = arg
+        else:
+            raise TypeError("{} can not be decorated by non_deterministic".format(arg))
+
+    def __call__(self, *args, **kwargs):
+        global _determinism
+        if self.cls is not None:
+            if _determinism:
+                raise TypeError(
+                    "{} is non-deterministic, but you set 'guaranteed_datapipes_determinism'. You can turn off determinism for this DataPipe if that is acceptable for your application".format(
+                        self.cls.__name__
+                    )
+                )
+            return self.cls(*args, **kwargs)
+        if not (isinstance(args[0], Type) and issubclass(args[0], IterDataPipe)):
+            raise TypeError(
+                "Only `IterDataPipe` can be decorated, but {} is found".format(
+                    args[0].__name__
+                )
+            )
+        self.cls = args[0]
+        return self.deterministic_wrapper_fn
+
+    def deterministic_wrapper_fn(self, *args, **kwargs) -> IterDataPipe:
+        res = self.deterministic_fn(*args, **kwargs)
+        if not isinstance(res, bool):
+            raise TypeError(
+                "deterministic_fn of `non_deterministic` decorator is required to return a boolean value, but {} is found".format(
+                    type(res)
+                )
+            )
+        global _determinism
+        if _determinism and res:
+            raise TypeError(
+                "{} is non-deterministic with the inputs, but you set 'guaranteed_datapipes_determinism'. You can turn off determinism for this DataPipe if that is acceptable for your application".format(
+                    self.cls.__name__
+                )
+            )
+        return self.cls(*args, **kwargs)
diff --git a/python/oneflow/utils/data/sampler.py b/python/oneflow/utils/data/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7567ca028b6f9fddbd5ea4d6bc812531da5b750c
--- /dev/null
+++ b/python/oneflow/utils/data/sampler.py
@@ -0,0 +1,207 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import builtins
+from typing import Generic, Iterator, List, Optional, Sequence, Sized, TypeVar, Union
+
+import numpy as np
+
+import oneflow as flow
+from oneflow.framework.tensor import Tensor
+
+T_co = TypeVar("T_co", covariant=True)
+
+
+class Sampler(Generic[T_co]):
+    """Base class for all Samplers.
+
+    Every Sampler subclass has to provide an :meth:`__iter__` method, providing a
+    way to iterate over indices of dataset elements, and a :meth:`__len__` method
+    that returns the length of the returned iterators.
+
+    .. note:: The :meth:`__len__` method isn't strictly required by
+              :class:`~flow.utils.data.DataLoader`, but is expected in any
+              calculation involving the length of a :class:`~flow.utils.data.DataLoader`.
+    """
+
+    def __init__(self, data_source: Optional[Sized]) -> None:
+        pass
+
+    def __iter__(self) -> Iterator[T_co]:
+        raise NotImplementedError
+
+
+class SequentialSampler(Sampler[int]):
+    """Samples elements sequentially, always in the same order.
+
+    Args:
+        data_source (Dataset): dataset to sample from
+    """
+
+    data_source: Sized
+
+    def __init__(self, data_source):
+        self.data_source = data_source
+
+    def __iter__(self):
+        return iter(range(len(self.data_source)))
+
+    def __len__(self) -> int:
+        return len(self.data_source)
+
+
+class RandomSampler(Sampler[int]):
+    """Samples elements randomly. If without replacement, then sample from a shuffled dataset.
+    If with replacement, then user can specify :attr:`num_samples` to draw.
+
+    Args:
+        data_source (Dataset): dataset to sample from
+        replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False``
+        num_samples (int): number of samples to draw, default=`len(dataset)`. This argument
+            is supposed to be specified only when `replacement` is ``True``.
+        generator (Generator): Generator used in sampling.
+    """
+
+    data_source: Sized
+    replacement: bool
+
+    def __init__(
+        self,
+        data_source: Sized,
+        replacement: bool = False,
+        num_samples: Optional[int] = None,
+        generator=None,
+    ) -> None:
+        self.data_source = data_source
+        self.replacement = replacement
+        self._num_samples = num_samples
+        self.generator = generator
+        if not isinstance(self.replacement, bool):
+            raise TypeError(
+                "replacement should be a boolean value, but got replacement={}".format(
+                    self.replacement
+                )
+            )
+        if self._num_samples is not None and (not replacement):
+            raise ValueError(
+                "With replacement=False, num_samples should not be specified, since a random permute will be performed."
+            )
+        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
+            raise ValueError(
+                "num_samples should be a positive integer value, but got num_samples={}".format(
+                    self.num_samples
+                )
+            )
+
+    @property
+    def num_samples(self) -> int:
+        if self._num_samples is None:
+            return len(self.data_source)
+        return self._num_samples
+
+    def __iter__(self):
+        n = len(self.data_source)
+        if self.generator is None:
+            generator = flow.Generator()
+            generator.manual_seed(
+                int(flow.Tensor(1, dtype=flow.int64).xavier_uniform_().numpy()[0])
+            )
+        else:
+            generator = self.generator
+        if self.replacement:
+            raise NotImplementedError("Not support replacement yet!")
+        else:
+            yield from np.random.permutation(n).tolist()
+
+    def __len__(self):
+        return self.num_samples
+
+
+class SubsetRandomSampler(Sampler[int]):
+    """Samples elements randomly from a given list of indices, without replacement.
+
+    Args:
+        indices (sequence): a sequence of indices
+        generator (Generator): Generator used in sampling.
+    """
+
+    indices: Sequence[int]
+
+    def __init__(self, indices: Sequence[int], generator=None) -> None:
+        self.indices = indices
+        self.generator = generator
+
+    def __iter__(self):
+        return (
+            self.indices[i]
+            for i in flow.randperm(len(self.indices), generator=self.generator)
+        )
+
+    def __len__(self):
+        return len(self.indices)
+
+
+class BatchSampler(Sampler[List[int]]):
+    """Wraps another sampler to yield a mini-batch of indices.
+
+    Args:
+        sampler (Sampler or Iterable): Base sampler. Can be any iterable object
+        batch_size (int): Size of mini-batch.
+        drop_last (bool): If ``True``, the sampler will drop the last batch if
+            its size would be less than ``batch_size``
+
+    Example:
+        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
+        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
+        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
+        [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
+    """
+
+    def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None:
+        if (
+            not isinstance(batch_size, int)
+            or isinstance(batch_size, bool)
+            or batch_size <= 0
+        ):
+            raise ValueError(
+                "batch_size should be a positive integer value, but got batch_size={}".format(
+                    batch_size
+                )
+            )
+        if not isinstance(drop_last, bool):
+            raise ValueError(
+                "drop_last should be a boolean value, but got drop_last={}".format(
+                    drop_last
+                )
+            )
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+
+    def __iter__(self):
+        batch = []
+        for idx in self.sampler:
+            batch.append(idx)
+            if len(batch) == self.batch_size:
+                yield batch
+                batch = []
+        if len(batch) > 0 and (not self.drop_last):
+            yield batch
+
+    def __len__(self):
+        if self.drop_last:
+            return len(self.sampler) // self.batch_size
+        else:
+            return (len(self.sampler) + self.batch_size - 1) // self.batch_size
diff --git a/python/oneflow/vm.py b/python/oneflow/vm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/setup.py b/python/setup.py
similarity index 57%
rename from setup.py
rename to python/setup.py
index 1c9dfd645e9dbe01eed58a942ad642e76a9382b8..6ed0d27fb0e2f5e820d7c857b70f2581681455c4 100644
--- a/setup.py
+++ b/python/setup.py
@@ -1,14 +1,29 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
 from __future__ import absolute_import
 
-import os
-import sys
 import argparse
 import glob
+import os
 import platform
-from setuptools import find_packages
-from setuptools import setup
-from setuptools.dist import Distribution
+import sys
+
+from setuptools import find_packages, setup
 from setuptools.command.install import install
+from setuptools.dist import Distribution
 
 
 # https://github.com/google/or-tools/issues/616
@@ -21,22 +36,9 @@ class InstallPlatlib(install):
 
 parser = argparse.ArgumentParser()
 parser.register("type", "bool", lambda v: v.lower() == "true")
-parser.add_argument(
-    "--with_xla",
-    type="bool",
-    default=False,
-    help="Package xla libraries if true, otherwise not.",
-)
-parser.add_argument("--build_dir", type=str, default="build")
 parser.add_argument("--package_name", type=str, default="oneflow")
 args, remain_args = parser.parse_known_args()
 sys.argv = ["setup.py"] + remain_args
-build_dir_from_env = os.getenv("ONEFLOW_CMAKE_BUILD_DIR")
-build_dir = args.build_dir
-if build_dir_from_env:
-    build_dir = build_dir_from_env
-
-print("using cmake build dir:", build_dir)
 REQUIRED_PACKAGES = [
     "numpy",
     "protobuf>=3.9.2",
@@ -53,29 +55,17 @@ class BinaryDistribution(Distribution):
         return True
 
 
-python_scripts_dir = os.path.join(build_dir, "python_scripts")
-packages = find_packages(python_scripts_dir)
-package_dir = {
-    "": python_scripts_dir,
-}
-
-include_files = glob.glob(
-    "{}/python_scripts/oneflow/include/**/*".format(build_dir), recursive=True
-)
-include_files = [
-    os.path.relpath(p, "{}/python_scripts/oneflow".format(build_dir))
-    for p in include_files
-]
+include_files = glob.glob("oneflow/include/**/*", recursive=True)
+include_files = [os.path.relpath(p, "oneflow") for p in include_files]
+assert len(include_files) > 0, os.path.abspath("oneflow/include")
 
 
 def get_oneflow_internal_so_path():
     import imp
 
-    fp, pathname, description = imp.find_module(
-        "_oneflow_internal", ["{}/python_scripts/oneflow".format(build_dir)]
-    )
+    fp, pathname, description = imp.find_module("_oneflow_internal", ["oneflow"])
     assert os.path.isfile(pathname)
-    return os.path.relpath(pathname, "{}/python_scripts/oneflow".format(build_dir))
+    return os.path.relpath(pathname, "oneflow")
 
 
 package_data = {"oneflow": [get_oneflow_internal_so_path()] + include_files}
@@ -85,7 +75,7 @@ def get_version():
     import importlib.util
 
     spec = importlib.util.spec_from_file_location(
-        "version", os.path.join(python_scripts_dir, "oneflow", "python", "version.py")
+        "version", os.path.join("oneflow", "version.py")
     )
     m = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(m)
@@ -97,8 +87,8 @@ setup(
     version=get_version(),
     url="https://www.oneflow.org/",
     install_requires=REQUIRED_PACKAGES,
-    packages=packages,
-    package_dir=package_dir,
+    packages=find_packages(),
+    package_dir={"oneflow": "oneflow"},
     package_data=package_data,
     zip_safe=False,
     distclass=BinaryDistribution,
diff --git a/tools/check_src.py b/tools/check_src.py
index 7177f834429ec25a10ecdf3f3889d5fc928e0cb0..e3ec06cd5c8356f245ff89bf7640189c13142d1d 100644
--- a/tools/check_src.py
+++ b/tools/check_src.py
@@ -41,13 +41,13 @@ def check_unwanted_test_scripts(python_test_dir=None, allowed=None):
 
 
 check_unwanted_test_scripts(
-    python_test_dir=os.path.join(src_root, "oneflow/python/test"),
+    python_test_dir=os.path.join(src_root, "python/oneflow/test"),
     allowed=["custom_ops", "dataloader", "graph", "models", "modules", "tensor"],
 )
 
 check_unwanted_test_scripts(
     python_test_dir=os.path.join(
-        src_root, "oneflow/compatible_single_client_python/test"
+        src_root, "python/oneflow/compatible/single_client/test"
     ),
     allowed=["models", "ops", "serving", "xrt",],
 )
diff --git a/tools/extract_oneflow_export.py b/tools/extract_oneflow_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d20eba4acfe6fa42b12c5a7e00c196f2085fb0
--- /dev/null
+++ b/tools/extract_oneflow_export.py
@@ -0,0 +1,676 @@
+# python3 -m pip install isort autoflake astpretty black
+# requires python3.9 to run
+import os
+import argparse
+import ast
+import subprocess
+import multiprocessing
+from pathlib import Path
+import astpretty
+import sys
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--out_dir", type=str, default="python",
+)
+parser.add_argument("--verbose", "-v", action="store_true")
+parser.add_argument("--debug", "-d", action="store_true")
+parser.add_argument("--autoflake", "-a", action="store_true")
+parser.add_argument("--black", "-b", action="store_true")
+parser.add_argument("--isort", "-i", action="store_true")
+parser.add_argument("--license", "-l", action="store_true")
+parser.add_argument("--ast", action="store_true")
+args = parser.parse_args()
+
+OUT_PATH = Path(args.out_dir)
+SHOULD_SAVE_AST = args.ast
+COMPATIBLE_MODULE = "oneflow.compatible.single_client"
+
+
+def dumpprint(node):
+    astpretty.pprint(node)
+
+
+def is_decorator(d, name=None):
+    return (isinstance(d, ast.Name) and d.id == name) or (
+        isinstance(d, ast.Call) and isinstance(d.func, ast.Name) and d.func.id == name
+    )
+
+
+def is_stable(node: ast.AST):
+    for d in node.decorator_list:
+        if is_decorator(d, "stable_api"):
+            return True
+    return False
+
+
+def is_experimental(node: ast.AST):
+    for d in node.decorator_list:
+        if is_decorator(d, "experimental_api"):
+            return True
+    return False
+
+
+def get_parent_module(value):
+    return ".".join(value.split(".")[0:-1])
+
+
+def join_module(*args):
+    return ".".join([m for m in args if m])
+
+
+def path_from_module(module, is_init=False):
+    if is_init:
+        return Path("/".join(module.split(".") + ["__init__.py"]))
+    else:
+        return Path("/".join(module.split(".")) + ".py")
+
+
+def module_from_path(path: Path):
+    assert path.name.endswith(".py")
+    parts = path.parts
+    if parts[-1] == "__init__.py":
+        return ".".join(path.parts[0:-1])
+    else:
+        return ".".join(path.parts)[0:-3]
+
+
+def is_compatible_root_module(module: str):
+    if module == COMPATIBLE_MODULE:
+        return True
+    assert module == "oneflow"
+    return False
+
+
+class ReservedKeywordsVisitor(ast.NodeVisitor):
+    def __init__(self, keywords=None) -> None:
+        self.keywords = keywords
+        self.has_reserved_keyword = False
+
+    def visit_Name(self, node: ast.Name):
+        if node.id in self.keywords:
+            self.has_reserved_keyword = True
+
+
+def replace_filename(name: str):
+    return name.replace("name_scope", "namescope")
+
+
+def replace_str(name: str):
+    name = replace_filename(name)
+    name = name.replace("lib.core", "support")
+    name = name.replace("compatible.single_client.core", "core")
+    name = name.replace("enable_typing_check", "typing_check")
+    if name.startswith("oneflow.python."):
+        return name.replace("oneflow.python.", "oneflow.")
+    elif name == "oneflow.python":
+        return "oneflow"
+    elif "single_client.python." in name or name.endswith("single_client.python"):
+        return name.replace("single_client.python", "single_client")
+    else:
+        return name
+
+
+class ExportVisitor(ast.NodeTransformer):
+    def __init__(self, root_module="oneflow", src_target_module: str = None) -> None:
+        super().__init__()
+        self.staging_decorators = []
+        self.root_module = root_module
+        self.export_modules = {}
+        self.top_imports = []
+        self.src_target_module = src_target_module
+
+    def append_export(self, target_module=None, node=None):
+        if target_module not in self.export_modules:
+            module = ast.Module(body=[], type_ignores=[])
+            self.export_modules[target_module] = module
+        else:
+            module = self.export_modules[target_module]
+        if isinstance(node, list):
+            module.body += node
+        else:
+            module.body.append(node)
+
+    def visit_Expr(self, node):
+        if isinstance(node.value, ast.Constant) and isinstance(node.value.value, str):
+            if "Copyright 2020 The OneFlow Authors" in node.value.value:
+                return None
+        return node
+
+    def visit_ImportFrom(self, node):
+        for name in node.names:
+            if isinstance(name, ast.alias) and name.name == "oneflow_deprecate":
+                return ast.ImportFrom(
+                    module="oneflow",
+                    names=[ast.alias(name="oneflow_deprecate")],
+                    level=0,
+                )
+        for name in node.names:
+            if not self.visit(name):
+                return None
+        if node.module:
+            if node.module == "__future__" or "oneflow_export" in node.module:
+                return None
+            node.module = replace_str(node.module)
+        self.top_imports.append(node)
+        return node
+
+    def visit_Import(self, node):
+        for name in node.names:
+            if not super().visit(name):
+                return None
+        self.top_imports.append(node)
+        return node
+
+    def visit_alias(self, node: ast.alias) -> ast.alias:
+        if node.name.startswith("oneflow.python."):
+            node.name = replace_str(node.name)
+            return node
+        elif node.name == "oneflow_export":
+            return None
+        elif "__export_symbols__" in node.name:
+            return None
+        else:
+            node.name = replace_str(node.name)
+            return node
+
+    def visit_Name(self, node: ast.AST):
+        if node.id == "oneflow_export":
+            return None
+        if node.id == "stable_api":
+            return None
+        if node.id == "experimental_api":
+            return None
+        return node
+
+    def visit_Call(self, node: ast.AST):
+        if not self.visit(node.func):
+            return None
+        return node
+
+    def visit_ClassDef(self, node):
+        node.body = [self.visit(n) for n in node.body]
+        return self.visit_FunctionDef(node)
+
+    def visit_FunctionDef(self, node):
+        is_compatible_and_experimental = is_compatible_root_module(
+            self.root_module
+        ) and is_experimental(node)
+        if not is_compatible_root_module(self.root_module) and is_stable(node):
+            return None
+        compact_decorator_list = [self.visit(d) for d in node.decorator_list]
+        compact_decorator_list = [d for d in compact_decorator_list if d]
+        node.body = [self.visit(n) for n in node.body]
+        rkv = ReservedKeywordsVisitor(keywords=set({"int", "float"}))
+        rkv.visit(node)
+        has_reserved_keyword = rkv.has_reserved_keyword
+        is_deprecated = False
+        for d in node.decorator_list:
+            if is_decorator(d, name="oneflow_deprecate"):
+                is_deprecated = True
+        for d in node.decorator_list:
+            if is_decorator(d, name="register_tensor_op"):
+                import_src = ast.parse(f"import {self.src_target_module}")
+                self.append_export(target_module=self.root_module, node=import_src)
+            if is_decorator(d, name="oneflow_export"):
+                is_kept_in_src = (
+                    True
+                    or has_reserved_keyword
+                    or self.src_target_module == target_module
+                    or target_module in ["oneflow", "oneflow.scope", COMPATIBLE_MODULE]
+                )
+                arg0 = d.args[0]
+                experimental_module = None
+                if is_compatible_and_experimental:
+                    experimental_module = "experimental"
+                target_module0 = join_module(
+                    self.root_module, experimental_module, get_parent_module(arg0.value)
+                )
+                target_symbol0 = arg0.value.split(".")[-1]
+
+                if ".".join([target_module0, target_symbol0]) == self.src_target_module:
+                    raise ValueError(
+                        "[colition][both func and module]", self.src_target_module
+                    )
+                if is_kept_in_src:
+                    target_module = self.src_target_module
+                    target_symbol = node.name
+                else:
+                    target_module = target_module0
+                    target_symbol = target_symbol0
+                # nth export: import from first export
+                for argN in d.args[1::]:
+                    target_moduleN = join_module(
+                        self.root_module,
+                        experimental_module,
+                        get_parent_module(argN.value),
+                    )
+                    target_nameN = argN.value.split(".")[-1]
+                    assert arg0 != argN, {"arg0": arg0, "argN": argN}
+                    import_from_first_export = ast.ImportFrom(
+                        module=target_module,
+                        names=[ast.alias(name=target_symbol, asname=target_nameN),],
+                        level=0,
+                    )
+                    self.append_export(
+                        target_module=target_moduleN, node=import_from_first_export
+                    )
+
+                if is_deprecated:
+                    import_oneflow_deprecate = ast.ImportFrom(
+                        module="oneflow",
+                        names=[ast.alias(name="oneflow_deprecate")],
+                        level=0,
+                    )
+
+                node.decorator_list = compact_decorator_list
+                if is_kept_in_src:
+                    asname = target_symbol0
+                    if node.name == target_symbol0:
+                        asname = None
+                    if target_module0 == target_module and node.name == target_symbol0:
+                        # print("[skip]", target_module0, target_symbol0)
+                        pass
+                    else:
+                        import_from_src = ast.ImportFrom(
+                            module=self.src_target_module,
+                            names=[ast.alias(name=node.name, asname=asname),],
+                            level=0,
+                        )
+                        self.append_export(
+                            target_module=target_module0, node=import_from_src
+                        )
+                    if is_deprecated:
+                        return [import_oneflow_deprecate, node]
+                    else:
+                        return node
+                else:
+                    if is_deprecated:
+                        self.append_export(
+                            target_module=target_module, node=import_oneflow_deprecate
+                        )
+                    # prepend imports in target module
+                    self.append_export(
+                        target_module=target_module, node=self.top_imports
+                    )
+                    if target_module != "oneflow":
+                        import_star_from_src = ast.ImportFrom(
+                            module=self.src_target_module,
+                            names=[ast.alias(name="*")],
+                            level=0,
+                        )
+                        # node.body.insert(0, import_star_from_src)
+                        self.append_export(
+                            target_module=target_module, node=import_star_from_src
+                        )
+                    # save func name for src import as before modifing node.name
+                    src_asname = None
+                    if node.name != target_symbol:
+                        src_asname = node.name
+                    # save first export in target module
+                    node.name = target_symbol
+                    self.append_export(target_module=target_module, node=node)
+
+                    # src: import from first export
+                    return ast.ImportFrom(
+                        module=target_module,
+                        names=[ast.alias(name=target_symbol, asname=src_asname),],
+                        level=0,
+                    )
+            if is_decorator(d, name="oneflow_export_value"):
+                assert len(node.body) == 2
+                assert len(d.args) == 1
+                target_module = join_module(
+                    self.root_module, get_parent_module(d.args[0].value)
+                )
+                call = node.body[1].value
+                assign = ast.Assign(
+                    targets=[
+                        ast.Name(id=d.args[0].value.split(".")[-1], ctx=ast.Store())
+                    ],
+                    value=call,
+                )
+                self.append_export(target_module=target_module, node=assign)
+                # TODO: the doc is not dumped properly
+                # doc = node.body[0]
+                # self.append_export(target_module=target_module, node=doc)
+                return None
+        node.decorator_list = compact_decorator_list
+        return node
+
+
+class SrcFile:
+    def __init__(self, spec) -> None:
+        is_test = "is_test" in spec and spec["is_test"]
+        self.export_visitor = None
+        self.tree = None
+        self.dst = Path(spec["dst"])
+        self.src: Path = spec["src"]
+        self.target_module = module_from_path(self.dst)
+        self.target_module = replace_str(self.target_module)
+        if is_test and args.verbose:
+            print("[skip test]", self.src)
+        else:
+            txt = self.src.read_text()
+            self.tree = ast.parse(txt)
+            root_module = "oneflow"
+            if (
+                "compatible_single_client_python" in self.src.parts
+                or self.src.name == "single_client_init.py"
+                or self.src.name == "single_client_main.py"
+            ):
+                root_module = COMPATIBLE_MODULE
+            self.export_visitor = ExportVisitor(
+                root_module=root_module, src_target_module=self.target_module
+            )
+            self.export_visitor.visit(self.tree)
+            if self.target_module == root_module:
+                self.export_visitor.append_export(
+                    target_module=root_module,
+                    node=ast.parse(f"from . import distributed"),
+                )
+                if self.target_module == "oneflow":
+                    self.export_visitor.append_export(
+                        target_module=root_module,
+                        node=ast.parse(f"from . import saved_model"),
+                    )
+                else:
+                    self.export_visitor.append_export(
+                        target_module="oneflow.compatible.single_client",
+                        node=ast.parse(
+                            f"from . import env, scope, math, optimizer, losses, config, layers, summary, random, typing, train, data, profiler, sysconfig, checkpoint, distribute, util, model, image, tensorrt, saved_model, regularizers"
+                        ),
+                    )
+                    self.export_visitor.append_export(
+                        target_module="oneflow.compatible.single_client.experimental",
+                        node=ast.parse(
+                            f"""from . import scope
+from oneflow.compatible.single_client import unittest
+"""
+                        ),
+                    )
+                    self.export_visitor.append_export(
+                        target_module="oneflow.compatible.single_client.deprecated",
+                        node=ast.parse(f"from . import nn"),
+                    )
+                    self.export_visitor.append_export(
+                        target_module="oneflow.compatible.single_client.config",
+                        node=ast.parse(f"from . import collective_boxing"),
+                    )
+                    self.export_visitor.append_export(
+                        target_module="oneflow.compatible.single_client.unittest",
+                        node=ast.parse(f"from . import env"),
+                    )
+                    self.export_visitor.append_export(
+                        target_module="oneflow.compatible.single_client.optimizer",
+                        node=ast.parse(
+                            f"from . import warmup, loss_scale, grad_clipping"
+                        ),
+                    )
+            #     self.export_visitor.append_export(
+            #         target_module=".".join([root_module, "lib.core"]), node=ast.parse(f"from . import async_util")
+            #     )
+
+
+def get_specs_under_python(python_path=None, dst_path=None):
+    specs = []
+    for p in Path(python_path).rglob("*.py"):
+        if p.name == "version.py":
+            continue
+        rel = p.relative_to(python_path)
+        dst = Path(dst_path).joinpath(rel)
+        dst = Path(replace_filename(str(dst)))
+        spec = {"src": p, "dst": dst}
+        if rel.parts[0] == "test":
+            spec["is_test"] = True
+        specs.append(spec)
+    return specs
+
+
+def get_files():
+    srcs = (
+        get_specs_under_python(python_path="oneflow/python", dst_path="oneflow")
+        + get_specs_under_python(
+            python_path="oneflow/compatible_single_client_python",
+            dst_path="oneflow/compatible/single_client",
+        )
+        + [
+            {"src": Path("oneflow/init.py"), "dst": "oneflow/__init__.py"},
+            {"src": Path("oneflow/__main__.py"), "dst": "oneflow/__main__.py"},
+            {
+                "src": Path("oneflow/single_client_init.py"),
+                "dst": "oneflow/compatible/single_client/__init__.py",
+            },
+            {
+                "src": Path("oneflow/single_client_main.py"),
+                "dst": "oneflow/compatible/single_client/__main__.py",
+            },
+        ]
+    )
+    srcs = list(filter(lambda x: ("oneflow_export" not in x["src"].name), srcs))
+    if args.debug:
+        srcs = [
+            {
+                "src": Path("oneflow/python/ops/nn_ops.py"),
+                "dst": "oneflow/ops/nn_ops.py",
+            },
+            {
+                "src": Path("oneflow/python/advanced/distribute_ops.py"),
+                "dst": "oneflow/advanced/distribute_ops.py",
+            },
+        ]
+    pool = multiprocessing.Pool()
+    srcs = pool.map(SrcFile, srcs,)
+    pool.close()
+    return srcs
+
+
+class ModuleNode:
+    def __init__(self, name=None, parent=None) -> None:
+        self.children = dict()
+        self.parent = parent
+        self.level = 0
+        if parent:
+            self.level = parent.level + 1
+        self.name = name
+
+    def add_or_get_child(self, name):
+        if name in self.children:
+            return self.children[name]
+        else:
+            self.children[name] = ModuleNode(name=name, parent=self)
+            return self.children[name]
+
+    @property
+    def is_leaf(self):
+        return len(self.children.keys()) == 0
+
+    def walk(self, cb):
+        cb(self)
+        for child in self.children.values():
+            child.walk(cb)
+
+    @property
+    def leafs(self):
+        ret = []
+
+        def add_leafs(node: ModuleNode):
+            if node.is_leaf:
+                ret.append(node)
+
+        self.walk(add_leafs)
+        return ret
+
+    @property
+    def full_name(self):
+        current_parent = self
+        ret = self.name
+        while current_parent.parent:
+            current_parent = current_parent.parent
+            ret = current_parent.name + "." + ret
+        return ret
+
+    def __str__(self) -> str:
+        return "\n".join(
+            [f"{self.full_name}"]
+            + [child.__str__() for child in self.children.values()]
+        )
+
+    @staticmethod
+    def add_sub_module(root=None, module=None):
+        parts = module.split(".")
+        current_node = root
+        assert current_node.name == parts[0]
+        for part in parts[1::]:
+            current_node = current_node.add_or_get_child(part)
+
+
+def save_trees(args=None):
+    dst: Path = args["dst"]
+    trees = args["trees"]
+    dst_full = OUT_PATH.joinpath(dst)
+    dst_full.parent.mkdir(parents=True, exist_ok=True)
+    dst_full.touch(exist_ok=False)
+    # TODO: append "doctest.testmod(raise_on_error=True)"
+    trees = [ast.fix_missing_locations(tree) for tree in trees]
+    if SHOULD_SAVE_AST:
+        new_txt = "\n".join([str(astpretty.pformat(tree)) for tree in trees])
+        new_txt = f"""from ast import *
+{new_txt}
+"""
+        dst_full.with_suffix(".ast.py").write_text(new_txt)
+    new_txt = ""
+    if dst.name.startswith("test_"):
+        if "compatible" in str(dst):
+            new_txt += f"""
+import {COMPATIBLE_MODULE}.unittest
+"""
+        else:
+            new_txt += """
+import oneflow.unittest
+"""
+    new_txt += "\n".join([ast.unparse(tree) for tree in trees])
+    dst_full.write_text(new_txt)
+
+
+def append_trees(tree_dict: dict, module: str, tree: ast.AST):
+    tree_dict[module] = tree_dict.get(module, [])
+    tree_dict[module].append(tree)
+
+
+if __name__ == "__main__":
+    out_oneflow_dir = os.path.join(args.out_dir, "oneflow")
+    assert args.out_dir
+    assert args.out_dir != "~"
+    assert args.out_dir != "/"
+    subprocess.check_call(f"mkdir -p {OUT_PATH}", shell=True)
+
+    for py_f in Path(out_oneflow_dir).glob("**/*.py"):
+        if py_f.name != "version.py":
+            py_f.unlink()
+    for pyc in Path(out_oneflow_dir).glob("**/*.pyc"):
+        pyc.unlink()
+    for d in reversed(
+        [
+            Path(dirpath)
+            for dirpath, dirnames, files in os.walk(out_oneflow_dir)
+            if not files and not dirnames
+        ]
+    ):
+        if "include" not in str(d) and "core" not in str(d):
+            d.rmdir()
+
+    srcs = get_files()
+    final_trees = {}
+
+    root_module = ModuleNode(name="oneflow")
+    src_module_added = {}
+    for s in srcs:
+        # src
+        append_trees(tree_dict=final_trees, module=s.target_module, tree=s.tree)
+        if (
+            str(s.src) == "oneflow/python/__init__.py"
+            or str(s.src) == "oneflow/compatible_single_client_python/__init__.py"
+        ):
+            assert not s.src.read_text()
+            continue
+        assert s.target_module not in src_module_added, {
+            "target_module": s.target_module,
+            "new": str(s.src),
+            "exist": str(src_module_added[s.target_module]),
+        }
+        src_module_added[s.target_module] = s.src
+        ModuleNode.add_sub_module(root=root_module, module=s.target_module)
+    for s in srcs:
+        # exports
+        for export_path, export_tree in s.export_visitor.export_modules.items():
+            append_trees(tree_dict=final_trees, module=export_path, tree=export_tree)
+            ModuleNode.add_sub_module(root=root_module, module=export_path)
+    leaf_modules = set([leaf.full_name for leaf in root_module.leafs])
+    pool = multiprocessing.Pool()
+
+    def is_init(module: str):
+        is_leaf = module in leaf_modules
+        is_magic = module.endswith("__")
+        return is_leaf == False and is_magic == False
+
+    srcs = pool.map(
+        save_trees,
+        [
+            {"dst": path_from_module(module, is_init=is_init(module)), "trees": trees,}
+            for module, trees in final_trees.items()
+        ],
+    )
+    pool.close()
+    # TODO: touch __init__.py, oneflow/F/__init__.py
+    Path(os.path.join(OUT_PATH, "oneflow", "F")).mkdir(exist_ok=True)
+    Path(os.path.join(OUT_PATH, "oneflow", "F/__init__.py")).touch()
+    Path(os.path.join(OUT_PATH, COMPATIBLE_MODULE.replace(".", "/"), "F")).mkdir(
+        parents=True, exist_ok=True
+    )
+    Path(
+        os.path.join(OUT_PATH, COMPATIBLE_MODULE.replace(".", "/"), "F/__init__.py")
+    ).touch()
+    Path(
+        os.path.join(OUT_PATH, COMPATIBLE_MODULE.replace(".", "/"), "experimental/F")
+    ).mkdir(parents=True, exist_ok=True)
+    Path(
+        os.path.join(
+            OUT_PATH, COMPATIBLE_MODULE.replace(".", "/"), "experimental/F/__init__.py"
+        )
+    ).touch()
+    Path(os.path.join(OUT_PATH, f"oneflow/compatible/__init__.py")).touch()
+    # step 1: extract all exports
+    # step 2: merge exports into src in python/
+    # step 4: finalize __all__, if it is imported by another module or wrapped in 'oneflow.export', it should appears in __all__
+    # step 5: save file and post process (sort imports, format, etc)
+    extra_arg = ""
+    if args.verbose == False:
+        extra_arg += "--quiet"
+    if args.autoflake:
+        print("[postprocess]", "autoflake")
+        subprocess.check_call(
+            f"{sys.executable} -m autoflake --in-place --remove-all-unused-imports --exclude '**/*.ast.py' --recursive .",
+            shell=True,
+            cwd=args.out_dir,
+        )
+    if args.isort:
+        print("[postprocess]", "isort")
+        subprocess.check_call(
+            f"{sys.executable} -m isort --skip oneflow/utils/data/__init__.py . {extra_arg}",
+            shell=True,
+            cwd=args.out_dir,
+        )
+    if args.license:
+        print("[postprocess]", "license")
+        subprocess.check_call(
+            f"`which python3` ci/check/run_license_format.py -i {OUT_PATH} --fix --silent",
+            shell=True,
+        )
+    if args.black:
+        print("[postprocess]", "black")
+        subprocess.check_call(
+            f"`which python3` -m black --exclude '\\.ast\\.py' . {extra_arg}",
+            shell=True,
+            cwd=args.out_dir,
+        )
diff --git a/tools/generate_pip_version.py b/tools/generate_pip_version.py
index fa700786a24c77d609781660af4455fa54b93d0b..f523d7ec0918d0e99a23ce5b81f1145637f8c97e 100644
--- a/tools/generate_pip_version.py
+++ b/tools/generate_pip_version.py
@@ -7,6 +7,7 @@ parser = argparse.ArgumentParser()
 parser.add_argument("--xla", default=False, action="store_true", required=False)
 parser.add_argument("--cuda", type=str, required=False)
 parser.add_argument("--src", type=str, required=False)
+parser.add_argument("--out", type=str, required=False)
 args = parser.parse_args()
 
 local_label = ""
@@ -54,8 +55,7 @@ if not os.getenv("ONEFLOW_RELEASE_VERSION") and not os.getenv(
     version += f".git.{git_hash}"
 
 
-dst = os.path.join(args.src, "oneflow/python/version.py")
-print(f"-- Generating pip version: {version}, writing to: {dst}")
-assert args.src
-with open(dst, "w+") as f:
+print(f"-- Generating pip version: {version}, writing to: {args.out}")
+assert args.out
+with open(args.out, "w+") as f:
     f.write(f'__version__ = "{version}"')